diff --git a/.appveyor.yml b/.appveyor.yml index 4cff03d571a1..58a536d17a98 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -1,4 +1,4 @@ -version: 4.1.0.99.{build} +version: 4.2.0.99.{build} image: Visual Studio 2015 platform: x64 diff --git a/.ci/check_python_dists.sh b/.ci/check_python_dists.sh index cb0bbae79fa9..1dd19679daae 100644 --- a/.ci/check_python_dists.sh +++ b/.ci/check_python_dists.sh @@ -25,7 +25,7 @@ if [ $PY_MINOR_VER -gt 7 ]; then pydistcheck \ --inspect \ --ignore 'compiled-objects-have-debug-symbols,distro-too-large-compressed' \ - --max-allowed-size-uncompressed '60M' \ + --max-allowed-size-uncompressed '100M' \ --max-allowed-files 800 \ ${DIST_DIR}/* || exit -1 elif { test $(uname -m) = "aarch64"; }; then diff --git a/.ci/lint-cpp.sh b/.ci/lint-cpp.sh index ef9fff683731..2d91f8e85f00 100755 --- a/.ci/lint-cpp.sh +++ b/.ci/lint-cpp.sh @@ -18,3 +18,27 @@ cmakelint \ ${cmake_files} \ || exit -1 echo "done running cmakelint" + +echo "checking that all OpenMP pragmas specify num_threads()" +get_omp_pragmas_without_num_threads() { + grep \ + -n \ + -R \ + --include='*.c' \ + --include='*.cc' \ + --include='*.cpp' \ + --include='*.h' \ + --include='*.hpp' \ + 'pragma omp parallel' \ + | grep -v ' num_threads' +} +PROBLEMATIC_LINES=$( + get_omp_pragmas_without_num_threads +) +if test "${PROBLEMATIC_LINES}" != ""; then + get_omp_pragmas_without_num_threads + echo "Found '#pragma omp parallel' not using explicit num_threads() configuration. Fix those." + echo "For details, see https://www.openmp.org/spec-html/5.0/openmpse14.html#x54-800002.6" + exit -1 +fi +echo "done checking OpenMP pragmas" diff --git a/.ci/lint_r_code.R b/.ci/lint_r_code.R index 12116104ef6d..d5b1217a5b04 100755 --- a/.ci/lint_r_code.R +++ b/.ci/lint_r_code.R @@ -1,5 +1,5 @@ -library(lintr) +loadNamespace("lintr") args <- commandArgs( trailingOnly = TRUE @@ -33,30 +33,34 @@ LINTERS_TO_USE <- list( , "any_duplicated" = lintr::any_duplicated_linter() , "any_is_na" = lintr::any_is_na_linter() , "assignment" = lintr::assignment_linter() + , "backport" = lintr::backport_linter() , "boolean_arithmetic" = lintr::boolean_arithmetic_linter() , "braces" = lintr::brace_linter() , "class_equals" = lintr::class_equals_linter() , "commas" = lintr::commas_linter() + , "conjunct_test" = lintr::conjunct_test_linter() , "duplicate_argument" = lintr::duplicate_argument_linter() , "empty_assignment" = lintr::empty_assignment_linter() , "equals_na" = lintr::equals_na_linter() + , "fixed_regex" = lintr::fixed_regex_linter() , "for_loop_index" = lintr::for_loop_index_linter() , "function_left" = lintr::function_left_parentheses_linter() + , "function_return" = lintr::function_return_linter() + , "implicit_assignment" = lintr::implicit_assignment_linter() , "implicit_integers" = lintr::implicit_integer_linter() , "infix_spaces" = lintr::infix_spaces_linter() , "inner_combine" = lintr::inner_combine_linter() , "is_numeric" = lintr::is_numeric_linter() - , "fixed_regex" = lintr::fixed_regex_linter() - , "function_return" = lintr::function_return_linter() , "lengths" = lintr::lengths_linter() + , "line_length" = lintr::line_length_linter(length = 120L) , "literal_coercion" = lintr::literal_coercion_linter() - , "long_lines" = lintr::line_length_linter(length = 120L) , "matrix" = lintr::matrix_apply_linter() , "missing_argument" = lintr::missing_argument_linter() , "non_portable_path" = lintr::nonportable_path_linter() , "numeric_leading_zero" = lintr::numeric_leading_zero_linter() , "outer_negation" = lintr::outer_negation_linter() 
, "package_hooks" = lintr::package_hooks_linter() + , "paren_body" = lintr::paren_body_linter() , "paste" = lintr::paste_linter() , "quotes" = lintr::quotes_linter() , "redundant_equals" = lintr::redundant_equals_linter() @@ -74,8 +78,7 @@ LINTERS_TO_USE <- list( , "true_false" = lintr::T_and_F_symbol_linter() , "undesirable_function" = lintr::undesirable_function_linter( fun = c( - "cat" = "CRAN forbids the use of cat() in packages except in special cases. Use message() or warning()." - , "cbind" = paste0( + "cbind" = paste0( "cbind is an unsafe way to build up a data frame. merge() or direct " , "column assignment is preferred." ) @@ -100,6 +103,7 @@ LINTERS_TO_USE <- list( "%>%" = pipe_text , "%.%" = pipe_text , "%..%" = pipe_text + , "|>" = pipe_text , "?" = interactive_text , "??" = interactive_text ) @@ -107,6 +111,7 @@ LINTERS_TO_USE <- list( , "unnecessary_concatenation" = lintr::unnecessary_concatenation_linter() , "unnecessary_lambda" = lintr::unnecessary_lambda_linter() , "unreachable_code" = lintr::unreachable_code_linter() + , "unused_import" = lintr::unused_import_linter() , "vector_logic" = lintr::vector_logic_linter() , "whitespace" = lintr::whitespace_linter() ) diff --git a/.ci/setup.sh b/.ci/setup.sh index f7da21286d7d..bc17fee03308 100755 --- a/.ci/setup.sh +++ b/.ci/setup.sh @@ -54,6 +54,14 @@ else # Linux sudo apt-get install --no-install-recommends -y \ clang \ libomp-dev + elif [[ $COMPILER == "clang-17" ]]; then + sudo apt-get install wget + wget -qO- https://apt.llvm.org/llvm-snapshot.gpg.key | sudo tee /etc/apt/trusted.gpg.d/apt.llvm.org.asc + sudo apt-add-repository deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-17 main + sudo apt-add-repository deb-src http://apt.llvm.org/jammy/ llvm-toolchain-jammy-17 main + sudo apt-get update + sudo apt-get install -y clang-17 + sudo apt-get install --no-install-recommends -y libomp-17-dev fi export LANG="en_US.UTF-8" diff --git a/.ci/test-python-oldest.sh b/.ci/test-python-oldest.sh index 3a0ea08dddda..40dfd393f1fe 100644 --- a/.ci/test-python-oldest.sh +++ b/.ci/test-python-oldest.sh @@ -7,9 +7,11 @@ # echo "installing lightgbm's dependencies" pip install \ + 'cffi==1.15.1' \ 'dataclasses' \ - 'numpy==1.12.0' \ + 'numpy==1.16.6' \ 'pandas==0.24.0' \ + 'pyarrow==6.0.1' \ 'scikit-learn==0.18.2' \ 'scipy==0.19.0' \ || exit -1 diff --git a/.ci/test.sh b/.ci/test.sh index af7cae2e3858..472fd7d8c6e5 100755 --- a/.ci/test.sh +++ b/.ci/test.sh @@ -6,6 +6,9 @@ if [[ $OS_NAME == "macos" ]] && [[ $COMPILER == "gcc" ]]; then elif [[ $OS_NAME == "linux" ]] && [[ $COMPILER == "clang" ]]; then export CXX=clang++ export CC=clang +elif [[ $OS_NAME == "linux" ]] && [[ $COMPILER == "clang-17" ]]; then + export CXX=clang++-17 + export CC=clang-17 fi if [[ $IN_UBUNTU_BASE_CONTAINER == "true" ]]; then @@ -37,7 +40,7 @@ fi CONDA_PYTHON_REQUIREMENT="python=$PYTHON_VERSION[build=*cpython]" if [[ $TASK == "if-else" ]]; then - conda create -q -y -n $CONDA_ENV ${CONDA_PYTHON_REQUIREMENT} numpy + mamba create -q -y -n $CONDA_ENV ${CONDA_PYTHON_REQUIREMENT} numpy source activate $CONDA_ENV mkdir $BUILD_DIRECTORY/build && cd $BUILD_DIRECTORY/build && cmake .. 
&& make lightgbm -j4 || exit -1 cd $BUILD_DIRECTORY/tests/cpp_tests && ../../lightgbm config=train.conf convert_model_language=cpp convert_model=../../src/boosting/gbdt_prediction.cpp && ../../lightgbm config=predict.conf output_result=origin.pred || exit -1 @@ -67,7 +70,7 @@ fi if [[ $TASK == "lint" ]]; then cd ${BUILD_DIRECTORY} - conda create -q -y -n $CONDA_ENV \ + mamba create -q -y -n $CONDA_ENV \ ${CONDA_PYTHON_REQUIREMENT} \ cmakelint \ cpplint \ @@ -87,10 +90,10 @@ fi if [[ $TASK == "check-docs" ]] || [[ $TASK == "check-links" ]]; then cd $BUILD_DIRECTORY/docs - conda env create \ + mamba env create \ -n $CONDA_ENV \ --file ./env.yml || exit -1 - conda install \ + mamba install \ -q \ -y \ -n $CONDA_ENV \ @@ -128,13 +131,15 @@ if [[ $PYTHON_VERSION == "3.7" ]]; then fi # including python=version[build=*cpython] to ensure that conda doesn't fall back to pypy -conda create -q -y -n $CONDA_ENV \ +mamba create -q -y -n $CONDA_ENV \ ${CONSTRAINED_DEPENDENCIES} \ + cffi \ cloudpickle \ joblib \ matplotlib \ numpy \ psutil \ + pyarrow \ pytest \ ${CONDA_PYTHON_REQUIREMENT} \ python-graphviz \ @@ -304,7 +309,7 @@ matplotlib.use\(\"Agg\"\)\ ' plot_example.py # prevent interactive window mode sed -i'.bak' 's/graph.render(view=True)/graph.render(view=False)/' plot_example.py # requirements for examples - conda install -q -y -n $CONDA_ENV \ + mamba install -q -y -n $CONDA_ENV \ h5py \ ipywidgets \ notebook @@ -315,11 +320,13 @@ matplotlib.use\(\"Agg\"\)\ # importing the library should succeed even if all optional dependencies are not present conda uninstall --force --yes \ + cffi \ dask \ distributed \ joblib \ matplotlib \ psutil \ + pyarrow \ python-graphviz \ scikit-learn || exit -1 python -c "import lightgbm" || exit -1 diff --git a/.ci/test_r_package.sh b/.ci/test_r_package.sh index e4d70261aa36..6d421f16be56 100755 --- a/.ci/test_r_package.sh +++ b/.ci/test_r_package.sh @@ -127,13 +127,13 @@ if [[ "${R_MAJOR_VERSION}" == "3" ]]; then Rscript --vanilla -e "install.packages('https://cran.r-project.org/src/contrib/Archive/lattice/lattice_0.20-41.tar.gz', repos = NULL, lib = '${R_LIB_PATH}')" fi -# Manually install Depends and Imports libraries + 'knitr', 'RhpcBLASctl', 'rmarkdown', 'testthat' +# Manually install Depends and Imports libraries + 'knitr', 'markdown', 'RhpcBLASctl', 'testthat' # to avoid a CI-time dependency on devtools (for devtools::install_deps()) # NOTE: testthat is not required when running rchk if [[ "${TASK}" == "r-rchk" ]]; then - packages="c('data.table', 'jsonlite', 'knitr', 'Matrix', 'R6', 'RhpcBLASctl', 'rmarkdown')" + packages="c('data.table', 'jsonlite', 'knitr', 'markdown', 'Matrix', 'R6', 'RhpcBLASctl')" else - packages="c('data.table', 'jsonlite', 'knitr', 'Matrix', 'R6', 'RhpcBLASctl', 'rmarkdown', 'testthat')" + packages="c('data.table', 'jsonlite', 'knitr', 'markdown', 'Matrix', 'R6', 'RhpcBLASctl', 'testthat')" fi compile_from_source="both" if [[ $OS_NAME == "macos" ]]; then diff --git a/.ci/test_r_package_valgrind.sh b/.ci/test_r_package_valgrind.sh index 9eddec2deed1..bd8f1f71f9f1 100755 --- a/.ci/test_r_package_valgrind.sh +++ b/.ci/test_r_package_valgrind.sh @@ -1,9 +1,11 @@ #!/bin/bash -RDscriptvalgrind -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'RhpcBLASctl', 'rmarkdown', 'testthat'), repos = 'https://cran.rstudio.com', Ncpus = parallel::detectCores())" || exit -1 +RDscriptvalgrind -e "install.packages(c('R6', 'data.table', 'jsonlite', 'Matrix', 'RhpcBLASctl', 'testthat'), repos = 'https://cran.rstudio.com')" || exit -1 
sh build-cran-package.sh \ --r-executable=RDvalgrind \ + --no-build-vignettes \ || exit -1 + RDvalgrind CMD INSTALL --preclean --install-tests lightgbm_*.tar.gz || exit -1 cd R-package/tests @@ -68,7 +70,7 @@ bytes_possibly_lost=$( | tr -d "," ) echo "valgrind found ${bytes_possibly_lost} bytes possibly lost" -if [[ ${bytes_possibly_lost} -gt 352 ]]; then +if [[ ${bytes_possibly_lost} -gt 1056 ]]; then exit -1 fi diff --git a/.ci/test_r_package_windows.ps1 b/.ci/test_r_package_windows.ps1 index e4d20de50b90..c4ae84f49dae 100644 --- a/.ci/test_r_package_windows.ps1 +++ b/.ci/test_r_package_windows.ps1 @@ -55,12 +55,14 @@ Remove-From-Path ".*\\R\\.*" Remove-From-Path ".*R Client.*" Remove-From-Path ".*rtools40.*" Remove-From-Path ".*rtools42.*" +Remove-From-Path ".*rtools43.*" Remove-From-Path ".*shells.*" Remove-From-Path ".*Strawberry.*" Remove-From-Path ".*tools.*" Remove-Item C:\rtools40 -Force -Recurse -ErrorAction Ignore Remove-Item C:\rtools42 -Force -Recurse -ErrorAction Ignore +Remove-Item C:\rtools43 -Force -Recurse -ErrorAction Ignore # Get details needed for installing R components # @@ -76,11 +78,11 @@ if ($env:R_MAJOR_VERSION -eq "3") { $env:RTOOLS_EXE_FILE = "rtools35-x86_64.exe" $env:R_WINDOWS_VERSION = "3.6.3" } elseif ($env:R_MAJOR_VERSION -eq "4") { - $RTOOLS_INSTALL_PATH = "C:\rtools42" + $RTOOLS_INSTALL_PATH = "C:\rtools43" $env:RTOOLS_BIN = "$RTOOLS_INSTALL_PATH\usr\bin" $env:RTOOLS_MINGW_BIN = "$RTOOLS_INSTALL_PATH\x86_64-w64-mingw32.static.posix\bin" - $env:RTOOLS_EXE_FILE = "rtools42-5253-5107.exe" - $env:R_WINDOWS_VERSION = "4.2.2" + $env:RTOOLS_EXE_FILE = "rtools43-5550-5548.exe" + $env:R_WINDOWS_VERSION = "4.3.1" } else { Write-Output "[ERROR] Unrecognized R version: $env:R_VERSION" Check-Output $false @@ -122,7 +124,7 @@ Start-Process -FilePath Rtools.exe -NoNewWindow -Wait -ArgumentList "/VERYSILENT Write-Output "Done installing Rtools" Write-Output "Installing dependencies" -$packages = "c('data.table', 'jsonlite', 'knitr', 'Matrix', 'processx', 'R6', 'RhpcBLASctl', 'rmarkdown', 'testthat'), dependencies = c('Imports', 'Depends', 'LinkingTo')" +$packages = "c('data.table', 'jsonlite', 'knitr', 'markdown', 'Matrix', 'processx', 'R6', 'RhpcBLASctl', 'testthat'), dependencies = c('Imports', 'Depends', 'LinkingTo')" Run-R-Code-Redirect-Stderr "options(install.packages.check.source = 'no'); install.packages($packages, repos = '$env:CRAN_MIRROR', type = 'binary', lib = '$env:R_LIB_PATH', Ncpus = parallel::detectCores())" ; Check-Output $? Write-Output "Building R package" @@ -203,6 +205,19 @@ if ($env:COMPILER -ne "MSVC") { } } +# Checking that the correct R version was used +if ($env:TOOLCHAIN -ne "MSVC") { + $checks = Select-String -Path "${LOG_FILE_NAME}" -Pattern "using R version $env:R_WINDOWS_VERSION" + $checks_cnt = $checks.Matches.length +} else { + $checks = Select-String -Path "${INSTALL_LOG_FILE_NAME}" -Pattern "R version passed into FindLibR.* $env:R_WINDOWS_VERSION" + $checks_cnt = $checks.Matches.length +} +if ($checks_cnt -eq 0) { + Write-Output "Wrong R version was found (expected '$env:R_WINDOWS_VERSION'). Check the build logs." + Check-Output $False +} + # Checking that we actually got the expected compiler. The R package has some logic # to fail back to MinGW if MSVC fails, but for CI builds we need to check that the correct # compiler was used. 
diff --git a/.ci/test_windows.ps1 b/.ci/test_windows.ps1 index 413af821e065..6b02aed6ce8b 100644 --- a/.ci/test_windows.ps1 +++ b/.ci/test_windows.ps1 @@ -52,12 +52,14 @@ conda install brotlipy conda update -q -y conda conda create -q -y -n $env:CONDA_ENV ` + cffi ` cloudpickle ` joblib ` matplotlib ` numpy ` pandas ` psutil ` + pyarrow ` pytest ` "python=$env:PYTHON_VERSION[build=*cpython]" ` python-graphviz ` diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 02b5cfbdae23..b6885cad0503 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -7,4 +7,4 @@ # offer a reasonable automatic best-guess # catch-all rule (this only gets matched if no rules below match) -* @guolinke @jameslamb @shiyu1994 @jmoralez +* @guolinke @jameslamb @shiyu1994 @jmoralez @borchero diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index 9ab8440a6625..3120fb1fae9f 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -31,17 +31,17 @@ jobs: - method: wheel compiler: gcc python_version: "3.11" - cuda_version: "11.7.1" + cuda_version: "11.8.0" task: cuda - method: source compiler: gcc python_version: "3.9" - cuda_version: "10.0" + cuda_version: "12.2.0" task: cuda - method: pip compiler: clang python_version: "3.10" - cuda_version: "11.7.1" + cuda_version: "11.8.0" task: cuda steps: - name: Setup or update software on host machine @@ -98,8 +98,10 @@ jobs: cuda_version="${{ matrix.cuda_version }}" cuda_major=${cuda_version%%.*} docker_img="nvcr.io/nvidia/cuda:${cuda_version}-devel" - if [[ ${cuda_major} -gt 10 ]]; then - docker_img="${docker_img}-ubuntu$(lsb_release -rs)" + if [[ ${cuda_major} -eq 11 ]]; then + docker_img="${docker_img}-ubuntu18.04" + elif [[ ${cuda_major} -ge 12 ]]; then + docker_img="${docker_img}-ubuntu20.04" fi docker run --env-file docker.env -v "$GITHUB_WORKSPACE":"$ROOT_DOCKER_FOLDER" --rm --gpus all "$docker_img" /bin/bash $ROOT_DOCKER_FOLDER/docker-script.sh all-cuda-jobs-successful: diff --git a/.github/workflows/lock.yml b/.github/workflows/lock.yml index 72d8b7c2f585..4efe658b7f45 100644 --- a/.github/workflows/lock.yml +++ b/.github/workflows/lock.yml @@ -18,15 +18,15 @@ jobs: action: runs-on: ubuntu-latest steps: - - uses: dessant/lock-threads@v4 + - uses: dessant/lock-threads@v5 with: github-token: ${{ github.token }} # after how many days of inactivity should a closed issue/PR be locked? - issue-inactive-days: '90' - pr-inactive-days: '90' + issue-inactive-days: '365' + pr-inactive-days: '365' # do not close feature request issues... # we close those but track them in https://github.com/microsoft/LightGBM/issues/2302 - exclude-any-issue-labels: '"feature request"' + exclude-any-issue-labels: 'feature request' # what labels should be removed prior to locking? remove-issue-labels: 'awaiting response,awaiting review,blocking,in progress' remove-pr-labels: 'awaiting response,awaiting review,blocking,in progress' @@ -42,3 +42,4 @@ jobs: # what shoulld the locking status be? 
issue-lock-reason: 'resolved' pr-lock-reason: 'resolved' + process-only: 'issues, prs' diff --git a/.github/workflows/r_package.yml b/.github/workflows/r_package.yml index 838528617143..4fa7f2ff8683 100644 --- a/.github/workflows/r_package.yml +++ b/.github/workflows/r_package.yml @@ -86,7 +86,7 @@ jobs: task: r-package compiler: MINGW toolchain: MSYS - r_version: 4.2 + r_version: 4.3 build_type: cmake container: null # Visual Studio 2019 @@ -102,7 +102,7 @@ jobs: task: r-package compiler: MSVC toolchain: MSVC - r_version: 4.2 + r_version: 4.3 build_type: cmake container: null ############### @@ -119,7 +119,7 @@ jobs: task: r-package compiler: MINGW toolchain: MSYS - r_version: 4.2 + r_version: 4.3 build_type: cran container: null - os: ubuntu-latest @@ -246,7 +246,7 @@ jobs: - name: Install packages shell: bash run: | - RDscript${{ matrix.r_customization }} -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'RhpcBLASctl', 'rmarkdown', 'testthat'), repos = 'https://cran.rstudio.com', Ncpus = parallel::detectCores())" + RDscript${{ matrix.r_customization }} -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'markdown', 'Matrix', 'RhpcBLASctl', 'testthat'), repos = 'https://cran.rstudio.com', Ncpus = parallel::detectCores())" sh build-cran-package.sh --r-executable=RD${{ matrix.r_customization }} RD${{ matrix.r_customization }} CMD INSTALL lightgbm_*.tar.gz || exit -1 - name: Run tests with sanitizers @@ -320,7 +320,7 @@ jobs: shell: bash run: | export PATH=/opt/R-devel/bin/:${PATH} - Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'RhpcBLASctl', 'rmarkdown', 'testthat'), repos = 'https://cran.rstudio.com', Ncpus = parallel::detectCores())" + Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'markdown', 'Matrix', 'RhpcBLASctl', 'testthat'), repos = 'https://cran.rstudio.com', Ncpus = parallel::detectCores())" sh build-cran-package.sh R CMD check --as-cran --run-donttest lightgbm_*.tar.gz || exit -1 if grep -q -E "NOTE|WARNING|ERROR" lightgbm.Rcheck/00check.log; then diff --git a/.github/workflows/static_analysis.yml b/.github/workflows/static_analysis.yml index bf369e79c0c5..33d50b751197 100644 --- a/.github/workflows/static_analysis.yml +++ b/.github/workflows/static_analysis.yml @@ -66,7 +66,7 @@ jobs: - name: Install packages shell: bash run: | - Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'RhpcBLASctl', 'rmarkdown', 'roxygen2', 'testthat'), repos = 'https://cran.rstudio.com', Ncpus = parallel::detectCores())" + Rscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'markdown', 'Matrix', 'RhpcBLASctl', 'roxygen2', 'testthat'), repos = 'https://cran.rstudio.com', Ncpus = parallel::detectCores())" sh build-cran-package.sh || exit -1 R CMD INSTALL --with-keep.source lightgbm_*.tar.gz || exit -1 - name: Test documentation diff --git a/.gitignore b/.gitignore index bcf6f48b4cea..9403475cc190 100644 --- a/.gitignore +++ b/.gitignore @@ -266,7 +266,7 @@ _Pvt_Extensions *.out *.app /windows/LightGBM.VC.db -lightgbm +/lightgbm /testlightgbm # Created by https://www.gitignore.io/api/python diff --git a/.vsts-ci.yml b/.vsts-ci.yml index 8c4e3e6a4949..f3f3fd8dd1fe 100644 --- a/.vsts-ci.yml +++ b/.vsts-ci.yml @@ -34,7 +34,7 @@ jobs: SETUP_CONDA: 'false' OS_NAME: 'linux' PRODUCES_ARTIFACTS: 'true' - pool: sh-ubuntu + pool: sh-mariner container: linux-artifact-builder strategy: matrix: @@ -82,12 +82,12 @@ jobs: - job: Linux_latest 
########################################### variables: - COMPILER: clang + COMPILER: clang-17 DEBIAN_FRONTEND: 'noninteractive' IN_UBUNTU_BASE_CONTAINER: 'true' OS_NAME: 'linux' SETUP_CONDA: 'true' - pool: sh-ubuntu + pool: sh-mariner container: ubuntu-latest strategy: matrix: @@ -309,7 +309,7 @@ jobs: R_LIB_PATH=~/Rlib export R_LIBS=${R_LIB_PATH} mkdir -p ${R_LIB_PATH} - RDscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'RhpcBLASctl', 'rmarkdown'), lib = '${R_LIB_PATH}', dependencies = c('Depends', 'Imports', 'LinkingTo'), repos = 'https://cran.rstudio.com', Ncpus = parallel::detectCores())" || exit -1 + RDscript -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'markdown', 'Matrix', 'RhpcBLASctl'), lib = '${R_LIB_PATH}', dependencies = c('Depends', 'Imports', 'LinkingTo'), repos = 'https://cran.rstudio.com', Ncpus = parallel::detectCores())" || exit -1 sh build-cran-package.sh --r-executable=RD || exit -1 mv lightgbm_${LGB_VER}.tar.gz $(Build.ArtifactStagingDirectory)/lightgbm-${LGB_VER}-r-cran.tar.gz displayName: 'Build CRAN R-package' diff --git a/CMakeLists.txt b/CMakeLists.txt index 6705ef130052..1ff289b9d045 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -27,20 +27,17 @@ if(APPLE) option(APPLE_OUTPUT_DYLIB "Output dylib shared library" OFF) endif() -if(__INTEGRATE_OPENCL) - cmake_minimum_required(VERSION 3.11) -elseif(USE_SWIG) - cmake_minimum_required(VERSION 3.8) -elseif(USE_GPU OR APPLE) - cmake_minimum_required(VERSION 3.2) -elseif(USE_CUDA) - cmake_minimum_required(VERSION 3.16) -else() - cmake_minimum_required(VERSION 3.0) -endif() +cmake_minimum_required(VERSION 3.18) project(lightgbm LANGUAGES C CXX) +if(BUILD_CPP_TEST) + set(CMAKE_CXX_STANDARD 14) +else() + set(CMAKE_CXX_STANDARD 11) +endif() +set(CMAKE_CXX_STANDARD_REQUIRED ON) + list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake/modules") #-- Sanitizer @@ -78,7 +75,6 @@ elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang") if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "8.1.0") message(FATAL_ERROR "Insufficient AppleClang version") endif() - cmake_minimum_required(VERSION 3.16) elseif(MSVC) if(MSVC_VERSION LESS 1900) message( @@ -86,7 +82,6 @@ elseif(MSVC) "The compiler ${CMAKE_CXX_COMPILER} doesn't support required C++11 features. Please use a newer MSVC." 
) endif() - cmake_minimum_required(VERSION 3.8) endif() if(USE_SWIG) @@ -203,20 +198,26 @@ if(__INTEGRATE_OPENCL) endif() if(USE_CUDA) - find_package(CUDA 10.0 REQUIRED) + find_package(CUDA 11.0 REQUIRED) include_directories(${CUDA_INCLUDE_DIRS}) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler=${OpenMP_CXX_FLAGS} -Xcompiler=-fPIC -Xcompiler=-Wall") - set(CUDA_ARCHS "6.0" "6.1" "6.2" "7.0") - if(CUDA_VERSION VERSION_GREATER_EQUAL "10.0") - list(APPEND CUDA_ARCHS "7.5") - endif() + # reference for mapping of CUDA toolkit component versions to supported architectures ("compute capabilities"): + # https://en.wikipedia.org/wiki/CUDA#GPUs_supported + set(CUDA_ARCHS "6.0" "6.1" "6.2" "7.0" "7.5") if(CUDA_VERSION VERSION_GREATER_EQUAL "11.0") list(APPEND CUDA_ARCHS "8.0") endif() if(CUDA_VERSION VERSION_GREATER_EQUAL "11.1") list(APPEND CUDA_ARCHS "8.6") endif() + if(CUDA_VERSION VERSION_GREATER_EQUAL "11.5") + list(APPEND CUDA_ARCHS "8.7") + endif() + if(CUDA_VERSION VERSION_GREATER_EQUAL "11.8") + list(APPEND CUDA_ARCHS "8.9") + list(APPEND CUDA_ARCHS "9.0") + endif() list(POP_BACK CUDA_ARCHS CUDA_LAST_SUPPORTED_ARCH) list(APPEND CUDA_ARCHS "${CUDA_LAST_SUPPORTED_ARCH}+PTX") cuda_select_nvcc_arch_flags(CUDA_ARCH_FLAGS ${CUDA_ARCHS}) @@ -324,7 +325,7 @@ endif() if(UNIX OR MINGW OR CYGWIN) set( CMAKE_CXX_FLAGS - "${CMAKE_CXX_FLAGS} -std=c++11 -pthread -Wextra -Wall -Wno-ignored-attributes -Wno-unknown-pragmas -Wno-return-type" + "${CMAKE_CXX_FLAGS} -pthread -Wextra -Wall -Wno-ignored-attributes -Wno-unknown-pragmas -Wno-return-type" ) if(MINGW) # ignore this warning: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95353 @@ -378,6 +379,13 @@ if(MSVC) CMAKE_CXX_FLAGS_RELWITHDEBINFO ) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W4 /MP") + if(__BUILD_FOR_R) + # MSVC does not like this commit: + # https://github.com/wch/r-source/commit/fb52ac1a610571fcb8ac92d886b9fefcffaa7d48 + # + # and raises "error C3646: 'private_data_c': unknown override specifier" + add_definitions(-DR_LEGACY_RCOMPLEX) + endif() if(USE_DEBUG) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Od") else() @@ -420,7 +428,11 @@ file( src/objective/*.cpp src/network/*.cpp src/treelearner/*.cpp -if(USE_CUDA) + src/utils/*.cpp +) +file( + GLOB + LGBM_CUDA_SOURCES src/treelearner/*.cu src/boosting/cuda/*.cpp src/boosting/cuda/*.cu @@ -434,9 +446,12 @@ if(USE_CUDA) src/io/cuda/*.cpp src/cuda/*.cpp src/cuda/*.cu -endif() ) +if(USE_CUDA) + list(APPEND SOURCES ${LGBM_CUDA_SOURCES}) +endif() + add_library(lightgbm_objs OBJECT ${SOURCES}) if(BUILD_CLI) @@ -501,7 +516,18 @@ if(USE_SWIG) TARGET _lightgbm_swig POST_BUILD COMMAND "${Java_JAVAC_EXECUTABLE}" -d . java/*.java - COMMAND cp "${PROJECT_SOURCE_DIR}/Release/*.dll" com/microsoft/ml/lightgbm/windows/x86_64 + COMMAND + "${CMAKE_COMMAND}" + -E + copy_if_different + "${PROJECT_SOURCE_DIR}/Release/lib_lightgbm.dll" + com/microsoft/ml/lightgbm/windows/x86_64 + COMMAND + "${CMAKE_COMMAND}" + -E + copy_if_different + "${PROJECT_SOURCE_DIR}/Release/lib_lightgbm_swig.dll" + com/microsoft/ml/lightgbm/windows/x86_64 COMMAND "${Java_JAR_EXECUTABLE}" -cf lightgbmlib.jar com ) endif() @@ -510,9 +536,16 @@ if(USE_SWIG) TARGET _lightgbm_swig POST_BUILD COMMAND "${Java_JAVAC_EXECUTABLE}" -d . 
java/*.java - COMMAND cp "${PROJECT_SOURCE_DIR}/*.dylib" com/microsoft/ml/lightgbm/osx/x86_64 COMMAND - cp + "${CMAKE_COMMAND}" + -E + copy_if_different + "${PROJECT_SOURCE_DIR}/lib_lightgbm.dylib" + com/microsoft/ml/lightgbm/osx/x86_64 + COMMAND + "${CMAKE_COMMAND}" + -E + copy_if_different "${PROJECT_SOURCE_DIR}/lib_lightgbm_swig.jnilib" com/microsoft/ml/lightgbm/osx/x86_64/lib_lightgbm_swig.dylib COMMAND "${Java_JAR_EXECUTABLE}" -cf lightgbmlib.jar com @@ -522,7 +555,12 @@ if(USE_SWIG) TARGET _lightgbm_swig POST_BUILD COMMAND "${Java_JAVAC_EXECUTABLE}" -d . java/*.java - COMMAND cp "${PROJECT_SOURCE_DIR}/*.so" com/microsoft/ml/lightgbm/linux/x86_64 + COMMAND + "${CMAKE_COMMAND}" + -E + copy_if_different + "${PROJECT_SOURCE_DIR}/lib_lightgbm.so" + com/microsoft/ml/lightgbm/linux/x86_64 COMMAND "${Java_JAR_EXECUTABLE}" -cf lightgbmlib.jar com ) endif() @@ -617,7 +655,7 @@ if(BUILD_CPP_TEST) FetchContent_Declare( googletest GIT_REPOSITORY https://github.com/google/googletest.git - GIT_TAG release-1.11.0 + GIT_TAG v1.14.0 ) FetchContent_MakeAvailable(googletest) add_library(GTest::GTest ALIAS gtest) @@ -628,6 +666,7 @@ if(BUILD_CPP_TEST) file(GLOB CPP_TEST_SOURCES tests/cpp_tests/*.cpp) if(MSVC) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /permissive-") set( CompilerFlags CMAKE_CXX_FLAGS diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index 1193c0d463b9..6fbeeee859be 100755 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -4,10 +4,10 @@ Title: Light Gradient Boosting Machine Version: ~~VERSION~~ Date: ~~DATE~~ Authors@R: c( - person("Yu", "Shi", email = "yushi2@microsoft.com", role = c("aut", "cre")), + person("Yu", "Shi", email = "yushi2@microsoft.com", role = c("aut")), person("Guolin", "Ke", email = "guolin.ke@outlook.com", role = c("aut")), person("Damien", "Soukhavong", email = "damien.soukhavong@skema.edu", role = c("aut")), - person("James", "Lamb", email="jaylamb20@gmail.com", role = c("aut")), + person("James", "Lamb", email="jaylamb20@gmail.com", role = c("aut", "cre")), person("Qi", "Meng", role = c("aut")), person("Thomas", "Finley", role = c("aut")), person("Taifeng", "Wang", role = c("aut")), @@ -46,9 +46,9 @@ Biarch: true VignetteBuilder: knitr Suggests: knitr, + markdown, processx, RhpcBLASctl, - rmarkdown, testthat Depends: R (>= 3.5) diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index e07af84d8824..718f0e55a0d7 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -9,6 +9,7 @@ S3method(print,lgb.Booster) S3method(set_field,lgb.Dataset) S3method(slice,lgb.Dataset) S3method(summary,lgb.Booster) +export(getLGBMthreads) export(get_field) export(lgb.Dataset) export(lgb.Dataset.construct) @@ -33,8 +34,7 @@ export(lgb.restore_handle) export(lgb.save) export(lgb.train) export(lightgbm) -export(readRDS.lgb.Booster) -export(saveRDS.lgb.Booster) +export(setLGBMthreads) export(set_field) export(slice) import(methods) diff --git a/R-package/R/callback.R b/R-package/R/callback.R index e428dfb79eea..c436409ddafb 100644 --- a/R-package/R/callback.R +++ b/R-package/R/callback.R @@ -90,7 +90,7 @@ cb_print_evaluation <- function(period) { # Check if message is existing if (nchar(msg) > 0L) { - print(.merge_eval_string(env = env)) + cat(.merge_eval_string(env = env), "\n") } } @@ -208,9 +208,9 @@ cb_early_stop <- function(stopping_rounds, first_metric_only, verbose) { msg <- paste0( "Will train until there is no improvement in " , stopping_rounds - , " rounds." 
+ , " rounds.\n" ) - print(msg) + cat(msg) } # Internally treat everything as a maximization task @@ -284,7 +284,7 @@ cb_early_stop <- function(stopping_rounds, first_metric_only, verbose) { } if (isTRUE(verbose)) { - print(paste0("Early stopping, best iteration is: ", best_msg[[i]])) + cat(paste0("Early stopping, best iteration is: ", best_msg[[i]], "\n")) } # Store best iteration and stop @@ -302,7 +302,7 @@ cb_early_stop <- function(stopping_rounds, first_metric_only, verbose) { } if (isTRUE(verbose)) { - print(paste0("Did not meet early stopping, best iteration is: ", best_msg[[i]])) + cat(paste0("Did not meet early stopping, best iteration is: ", best_msg[[i]], "\n")) } # Store best iteration and stop @@ -323,17 +323,17 @@ cb_early_stop <- function(stopping_rounds, first_metric_only, verbose) { } # Extract callback names from the list of callbacks -callback.names <- function(cb_list) { +.callback_names <- function(cb_list) { return(unlist(lapply(cb_list, attr, "name"))) } -add.cb <- function(cb_list, cb) { +.add_cb <- function(cb_list, cb) { # Combine two elements cb_list <- c(cb_list, cb) # Set names of elements - names(cb_list) <- callback.names(cb_list = cb_list) + names(cb_list) <- .callback_names(cb_list = cb_list) if ("cb_early_stop" %in% names(cb_list)) { @@ -349,7 +349,7 @@ add.cb <- function(cb_list, cb) { } -categorize.callbacks <- function(cb_list) { +.categorize_callbacks <- function(cb_list) { # Check for pre-iteration or post-iteration return( diff --git a/R-package/R/lgb.Booster.R b/R-package/R/lgb.Booster.R index 755b171724f9..4437c6fa552e 100644 --- a/R-package/R/lgb.Booster.R +++ b/R-package/R/lgb.Booster.R @@ -31,12 +31,12 @@ Booster <- R6::R6Class( if (!is.null(train_set)) { - if (!lgb.is.Dataset(train_set)) { + if (!.is_Dataset(train_set)) { stop("lgb.Booster: Can only use lgb.Dataset as training data") } train_set_handle <- train_set$.__enclos_env__$private$get_handle() params <- utils::modifyList(params, train_set$get_params()) - params_str <- lgb.params2str(params = params) + params_str <- .params2str(params = params) # Store booster handle handle <- .Call( LGBM_BoosterCreate_R @@ -130,7 +130,7 @@ Booster <- R6::R6Class( # Add validation data add_valid = function(data, name) { - if (!lgb.is.Dataset(data)) { + if (!.is_Dataset(data)) { stop("lgb.Booster.add_valid: Can only use lgb.Dataset as validation data") } @@ -167,7 +167,7 @@ Booster <- R6::R6Class( params <- utils::modifyList(self$params, params) } - params_str <- lgb.params2str(params = params) + params_str <- .params2str(params = params) self$restore_handle() @@ -193,7 +193,7 @@ Booster <- R6::R6Class( if (!is.null(train_set)) { - if (!lgb.is.Dataset(train_set)) { + if (!.is_Dataset(train_set)) { stop("lgb.Booster.update: Only can use lgb.Dataset as training data") } @@ -340,7 +340,7 @@ Booster <- R6::R6Class( # Evaluate data on metrics eval = function(data, name, feval = NULL) { - if (!lgb.is.Dataset(data)) { + if (!.is_Dataset(data)) { stop("lgb.Booster.eval: Can only use lgb.Dataset to eval") } @@ -508,17 +508,17 @@ Booster <- R6::R6Class( # NOTE: doing this here instead of in Predictor$predict() to keep # Predictor$predict() as fast as possible if (length(params) > 0L) { - params <- lgb.check.wrapper_param( + params <- .check_wrapper_param( main_param_name = "predict_raw_score" , params = params , alternative_kwarg_value = rawscore ) - params <- lgb.check.wrapper_param( + params <- .check_wrapper_param( main_param_name = "predict_leaf_index" , params = params , alternative_kwarg_value = predleaf ) - 
params <- lgb.check.wrapper_param( + params <- .check_wrapper_param( main_param_name = "predict_contrib" , params = params , alternative_kwarg_value = predcontrib @@ -586,7 +586,7 @@ Booster <- R6::R6Class( , predcontrib , start_iteration , num_iteration - , lgb.params2str(params = params) + , .params2str(params = params) ) private$fast_predict_config <- list( @@ -622,7 +622,7 @@ Booster <- R6::R6Class( }, check_null_handle = function() { - return(lgb.is.null.handle(private$handle)) + return(.is_null_handle(private$handle)) }, restore_handle = function() { @@ -917,6 +917,8 @@ NULL #' the factor levels not being present in the output. #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -959,7 +961,7 @@ predict.lgb.Booster <- function(object, params = list(), ...) { - if (!lgb.is.Booster(x = object)) { + if (!.is_Booster(x = object)) { stop("predict.lgb.Booster: object should be an ", sQuote("lgb.Booster")) } @@ -1082,6 +1084,8 @@ predict.lgb.Booster <- function(object, #' \link{predict.lgb.Booster}. #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' library(lightgbm) #' data(mtcars) #' X <- as.matrix(mtcars[, -1L]) @@ -1114,7 +1118,7 @@ lgb.configure_fast_predict <- function(model, num_iteration = NULL, type = "response", params = list()) { - if (!lgb.is.Booster(x = model)) { + if (!.is_Booster(x = model)) { stop("lgb.configure_fast_predict: model should be an ", sQuote("lgb.Booster")) } if (type == "class") { @@ -1160,7 +1164,7 @@ lgb.configure_fast_predict <- function(model, print.lgb.Booster <- function(x, ...) { # nolint start handle <- x$.__enclos_env__$private$handle - handle_is_null <- lgb.is.null.handle(handle) + handle_is_null <- .is_null_handle(handle) if (!handle_is_null) { ntrees <- x$current_iter() @@ -1224,6 +1228,8 @@ summary.lgb.Booster <- function(object, ...) 
{ #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -1289,6 +1295,8 @@ lgb.load <- function(filename = NULL, model_str = NULL) { #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' library(lightgbm) #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train @@ -1316,7 +1324,7 @@ lgb.load <- function(filename = NULL, model_str = NULL) { #' @export lgb.save <- function(booster, filename, num_iteration = NULL) { - if (!lgb.is.Booster(x = booster)) { + if (!.is_Booster(x = booster)) { stop("lgb.save: booster should be an ", sQuote("lgb.Booster")) } @@ -1346,6 +1354,8 @@ lgb.save <- function(booster, filename, num_iteration = NULL) { #' @examples #' \donttest{ #' library(lightgbm) +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -1372,7 +1382,7 @@ lgb.save <- function(booster, filename, num_iteration = NULL) { #' @export lgb.dump <- function(booster, num_iteration = NULL) { - if (!lgb.is.Booster(x = booster)) { + if (!.is_Booster(x = booster)) { stop("lgb.dump: booster should be an ", sQuote("lgb.Booster")) } @@ -1396,6 +1406,8 @@ lgb.dump <- function(booster, num_iteration = NULL) { #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' # train a regression model #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train @@ -1430,7 +1442,7 @@ lgb.dump <- function(booster, num_iteration = NULL) { #' @export lgb.get.eval.result <- function(booster, data_name, eval_name, iters = NULL, is_err = FALSE) { - if (!lgb.is.Booster(x = booster)) { + if (!.is_Booster(x = booster)) { stop("lgb.get.eval.result: Can only use ", sQuote("lgb.Booster"), " to get eval result") } @@ -1462,7 +1474,6 @@ lgb.get.eval.result <- function(booster, data_name, eval_name, iters = NULL, is_ , toString(eval_names) , "]" )) - stop("lgb.get.eval.result: wrong eval name") } result <- booster$record_evals[[data_name]][[eval_name]][[.EVAL_KEY()]] diff --git a/R-package/R/lgb.DataProcessor.R b/R-package/R/lgb.DataProcessor.R index fc7061945b62..c35ce4f93bd3 100644 --- a/R-package/R/lgb.DataProcessor.R +++ b/R-package/R/lgb.DataProcessor.R @@ -39,7 +39,7 @@ DataProcessor <- R6::R6Class( ) } data_num_class <- length(self$factor_levels) - params <- lgb.check.wrapper_param( + params <- .check_wrapper_param( main_param_name = "num_class" , params = params , alternative_kwarg_value = data_num_class diff --git a/R-package/R/lgb.Dataset.R b/R-package/R/lgb.Dataset.R index e2892ea4bae0..ff9b0b4fa38a 100644 --- a/R-package/R/lgb.Dataset.R +++ b/R-package/R/lgb.Dataset.R @@ -55,10 +55,10 @@ Dataset <- R6::R6Class( init_score = NULL) { # validate inputs early to avoid unnecessary computation - if (!(is.null(reference) || lgb.is.Dataset(reference))) { + if (!(is.null(reference) || .is_Dataset(reference))) { stop("lgb.Dataset: If provided, reference must be a ", sQuote("lgb.Dataset")) } - if (!(is.null(predictor) || lgb.is.Predictor(predictor))) { + if (!(is.null(predictor) || .is_Predictor(predictor))) { stop("lgb.Dataset: If provided, predictor must be a ", sQuote("lgb.Predictor")) } @@ -135,7 +135,7 @@ Dataset <- R6::R6Class( construct = function() { # 
Check for handle null - if (!lgb.is.null.handle(x = private$handle)) { + if (!.is_null_handle(x = private$handle)) { return(invisible(self)) } @@ -191,7 +191,7 @@ Dataset <- R6::R6Class( } # Generate parameter str - params_str <- lgb.params2str(params = private$params) + params_str <- .params2str(params = private$params) # Get handle of reference dataset ref_handle <- NULL @@ -277,7 +277,7 @@ Dataset <- R6::R6Class( ) } - if (lgb.is.null.handle(x = handle)) { + if (.is_null_handle(x = handle)) { stop("lgb.Dataset.construct: cannot create Dataset handle") } # Setup class and private type @@ -345,7 +345,7 @@ Dataset <- R6::R6Class( dim = function() { # Check for handle - if (!lgb.is.null.handle(x = private$handle)) { + if (!.is_null_handle(x = private$handle)) { num_row <- 0L num_col <- 0L @@ -385,7 +385,7 @@ Dataset <- R6::R6Class( # Get number of bins for feature get_feature_num_bin = function(feature) { - if (lgb.is.null.handle(x = private$handle)) { + if (.is_null_handle(x = private$handle)) { stop("Cannot get number of bins in feature before constructing Dataset.") } if (is.character(feature)) { @@ -409,7 +409,7 @@ Dataset <- R6::R6Class( get_colnames = function() { # Check for handle - if (!lgb.is.null.handle(x = private$handle)) { + if (!.is_null_handle(x = private$handle)) { private$colnames <- .Call( LGBM_DatasetGetFeatureNames_R , private$handle @@ -449,7 +449,7 @@ Dataset <- R6::R6Class( # Write column names private$colnames <- colnames - if (!lgb.is.null.handle(x = private$handle)) { + if (!.is_null_handle(x = private$handle)) { # Merge names with tab separation merged_name <- paste0(as.list(private$colnames), collapse = "\t") @@ -478,7 +478,7 @@ Dataset <- R6::R6Class( # Check for info name and handle if (is.null(private$info[[field_name]])) { - if (lgb.is.null.handle(x = private$handle)) { + if (.is_null_handle(x = private$handle)) { stop("Cannot perform Dataset$get_field() before constructing Dataset.") } @@ -536,7 +536,7 @@ Dataset <- R6::R6Class( # Store information privately private$info[[field_name]] <- data - if (!lgb.is.null.handle(x = private$handle) && !is.null(data)) { + if (!.is_null_handle(x = private$handle) && !is.null(data)) { if (length(data) > 0L) { @@ -583,14 +583,14 @@ Dataset <- R6::R6Class( return(invisible(self)) } new_params <- utils::modifyList(private$params, params) - if (lgb.is.null.handle(x = private$handle)) { + if (.is_null_handle(x = private$handle)) { private$params <- new_params } else { tryCatch({ .Call( LGBM_DatasetUpdateParamChecking_R - , lgb.params2str(params = private$params) - , lgb.params2str(params = new_params) + , .params2str(params = private$params) + , .params2str(params = new_params) ) private$params <- new_params }, error = function(e) { @@ -663,7 +663,7 @@ Dataset <- R6::R6Class( please set ", sQuote("free_raw_data = FALSE"), " when you construct lgb.Dataset") } - if (!lgb.is.Dataset(reference)) { + if (!.is_Dataset(reference)) { stop("set_reference: Can only use lgb.Dataset as a reference") } @@ -711,7 +711,7 @@ Dataset <- R6::R6Class( get_handle = function() { # Get handle and construct if needed - if (lgb.is.null.handle(x = private$handle)) { + if (.is_null_handle(x = private$handle)) { self$construct() } return(private$handle) @@ -734,7 +734,7 @@ Dataset <- R6::R6Class( if (!is.null(predictor)) { # Predictor is unknown - if (!lgb.is.Predictor(predictor)) { + if (!.is_Predictor(predictor)) { stop("set_predictor: Can only use lgb.Predictor as predictor") } @@ -780,6 +780,8 @@ Dataset <- R6::R6Class( #' #' @examples #' 
\donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -837,6 +839,8 @@ lgb.Dataset <- function(data, #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -888,7 +892,7 @@ lgb.Dataset.create.valid <- function(dataset, init_score = NULL, params = list()) { - if (!lgb.is.Dataset(x = dataset)) { + if (!.is_Dataset(x = dataset)) { stop("lgb.Dataset.create.valid: input data should be an lgb.Dataset object") } @@ -913,6 +917,8 @@ lgb.Dataset.create.valid <- function(dataset, #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -922,7 +928,7 @@ lgb.Dataset.create.valid <- function(dataset, #' @export lgb.Dataset.construct <- function(dataset) { - if (!lgb.is.Dataset(x = dataset)) { + if (!.is_Dataset(x = dataset)) { stop("lgb.Dataset.construct: input data should be an lgb.Dataset object") } @@ -942,6 +948,8 @@ lgb.Dataset.construct <- function(dataset) { #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -954,7 +962,7 @@ lgb.Dataset.construct <- function(dataset) { #' @export dim.lgb.Dataset <- function(x) { - if (!lgb.is.Dataset(x = x)) { + if (!.is_Dataset(x = x)) { stop("dim.lgb.Dataset: input data should be an lgb.Dataset object") } @@ -975,6 +983,8 @@ dim.lgb.Dataset <- function(x) { #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -989,7 +999,7 @@ dim.lgb.Dataset <- function(x) { #' @export dimnames.lgb.Dataset <- function(x) { - if (!lgb.is.Dataset(x = x)) { + if (!.is_Dataset(x = x)) { stop("dimnames.lgb.Dataset: input data should be an lgb.Dataset object") } @@ -1045,6 +1055,8 @@ dimnames.lgb.Dataset <- function(x) { #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -1062,7 +1074,7 @@ slice <- function(dataset, idxset) { #' @export slice.lgb.Dataset <- function(dataset, idxset) { - if (!lgb.is.Dataset(x = dataset)) { + if (!.is_Dataset(x = dataset)) { stop("slice.lgb.Dataset: input dataset should be an lgb.Dataset object") } @@ -1089,6 +1101,8 @@ slice.lgb.Dataset <- function(dataset, idxset) { #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -1110,7 +1124,7 @@ get_field <- function(dataset, field_name) { get_field.lgb.Dataset <- function(dataset, field_name) { # Check if dataset is not a dataset - if (!lgb.is.Dataset(x = dataset)) { + if (!.is_Dataset(x = dataset)) { stop("get_field.lgb.Dataset(): 
input dataset should be an lgb.Dataset object") } @@ -1138,6 +1152,8 @@ get_field.lgb.Dataset <- function(dataset, field_name) { #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -1158,7 +1174,7 @@ set_field <- function(dataset, field_name, data) { #' @export set_field.lgb.Dataset <- function(dataset, field_name, data) { - if (!lgb.is.Dataset(x = dataset)) { + if (!.is_Dataset(x = dataset)) { stop("set_field.lgb.Dataset: input dataset should be an lgb.Dataset object") } @@ -1177,6 +1193,8 @@ set_field.lgb.Dataset <- function(dataset, field_name, data) { #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -1189,7 +1207,7 @@ set_field.lgb.Dataset <- function(dataset, field_name, data) { #' @export lgb.Dataset.set.categorical <- function(dataset, categorical_feature) { - if (!lgb.is.Dataset(x = dataset)) { + if (!.is_Dataset(x = dataset)) { stop("lgb.Dataset.set.categorical: input dataset should be an lgb.Dataset object") } @@ -1207,6 +1225,8 @@ lgb.Dataset.set.categorical <- function(dataset, categorical_feature) { #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' # create training Dataset #' data(agaricus.train, package ="lightgbm") #' train <- agaricus.train @@ -1222,7 +1242,7 @@ lgb.Dataset.set.categorical <- function(dataset, categorical_feature) { #' @export lgb.Dataset.set.reference <- function(dataset, reference) { - if (!lgb.is.Dataset(x = dataset)) { + if (!.is_Dataset(x = dataset)) { stop("lgb.Dataset.set.reference: input dataset should be an lgb.Dataset object") } @@ -1240,6 +1260,8 @@ lgb.Dataset.set.reference <- function(dataset, reference) { #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -1248,7 +1270,7 @@ lgb.Dataset.set.reference <- function(dataset, reference) { #' @export lgb.Dataset.save <- function(dataset, fname) { - if (!lgb.is.Dataset(x = dataset)) { + if (!.is_Dataset(x = dataset)) { stop("lgb.Dataset.save: input dataset should be an lgb.Dataset object") } diff --git a/R-package/R/lgb.Predictor.R b/R-package/R/lgb.Predictor.R index 0b7b39e2d8c2..3a411efd75ba 100644 --- a/R-package/R/lgb.Predictor.R +++ b/R-package/R/lgb.Predictor.R @@ -28,7 +28,7 @@ Predictor <- R6::R6Class( # Initialize will create a starter model initialize = function(modelfile, params = list(), fast_predict_config = list()) { - private$params <- lgb.params2str(params = params) + private$params <- .params2str(params = params) handle <- NULL if (is.character(modelfile)) { @@ -46,7 +46,7 @@ Predictor <- R6::R6Class( handle <- modelfile private$need_free_handle <- FALSE - } else if (lgb.is.Booster(modelfile)) { + } else if (.is_Booster(modelfile)) { handle <- modelfile$get_handle() private$need_free_handle <- FALSE @@ -512,7 +512,7 @@ Predictor <- R6::R6Class( return(FALSE) } - if (lgb.is.null.handle(private$fast_predict_config$handle)) { + if (.is_null_handle(private$fast_predict_config$handle)) { warning(paste0("Model had fast CSR predict configuration, but it is inactive." 
, " Try re-generating it through 'lgb.configure_fast_predict'.")) return(FALSE) @@ -527,8 +527,8 @@ Predictor <- R6::R6Class( private$fast_predict_config$rawscore == rawscore && private$fast_predict_config$predleaf == predleaf && private$fast_predict_config$predcontrib == predcontrib && - lgb.equal.or.both.null(private$fast_predict_config$start_iteration, start_iteration) && - lgb.equal.or.both.null(private$fast_predict_config$num_iteration, num_iteration) + .equal_or_both_null(private$fast_predict_config$start_iteration, start_iteration) && + .equal_or_both_null(private$fast_predict_config$num_iteration, num_iteration) ) } ) diff --git a/R-package/R/lgb.cv.R b/R-package/R/lgb.cv.R index f81026fe673f..0545fbf71899 100644 --- a/R-package/R/lgb.cv.R +++ b/R-package/R/lgb.cv.R @@ -51,6 +51,8 @@ CVBooster <- R6::R6Class( #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -99,7 +101,7 @@ lgb.cv <- function(params = list() } # If 'data' is not an lgb.Dataset, try to construct one using 'label' - if (!lgb.is.Dataset(x = data)) { + if (!.is_Dataset(x = data)) { if (is.null(label)) { stop("'label' must be provided for lgb.cv if 'data' is not an 'lgb.Dataset'") } @@ -110,27 +112,27 @@ lgb.cv <- function(params = list() # in `params`. # this ensures that the model stored with Booster$save() correctly represents # what was passed in - params <- lgb.check.wrapper_param( + params <- .check_wrapper_param( main_param_name = "verbosity" , params = params , alternative_kwarg_value = verbose ) - params <- lgb.check.wrapper_param( + params <- .check_wrapper_param( main_param_name = "num_iterations" , params = params , alternative_kwarg_value = nrounds ) - params <- lgb.check.wrapper_param( + params <- .check_wrapper_param( main_param_name = "metric" , params = params , alternative_kwarg_value = NULL ) - params <- lgb.check.wrapper_param( + params <- .check_wrapper_param( main_param_name = "objective" , params = params , alternative_kwarg_value = obj ) - params <- lgb.check.wrapper_param( + params <- .check_wrapper_param( main_param_name = "early_stopping_round" , params = params , alternative_kwarg_value = early_stopping_rounds @@ -148,7 +150,7 @@ lgb.cv <- function(params = list() # (for backwards compatibility). If it is a list of functions, store # all of them. 
This makes it possible to pass any mix of strings like "auc" # and custom functions to eval - params <- lgb.check.eval(params = params, eval = eval) + params <- .check_eval(params = params, eval = eval) eval_functions <- list(NULL) if (is.function(eval)) { eval_functions <- list(eval) @@ -166,7 +168,7 @@ lgb.cv <- function(params = list() # Check for boosting from a trained model if (is.character(init_model)) { predictor <- Predictor$new(modelfile = init_model) - } else if (lgb.is.Booster(x = init_model)) { + } else if (.is_Booster(x = init_model)) { predictor <- init_model$to_predictor() } @@ -193,7 +195,7 @@ lgb.cv <- function(params = list() } else if (!is.null(data$get_colnames())) { cnames <- data$get_colnames() } - params[["interaction_constraints"]] <- lgb.check_interaction_constraints( + params[["interaction_constraints"]] <- .check_interaction_constraints( interaction_constraints = interaction_constraints , column_names = cnames ) @@ -232,7 +234,7 @@ lgb.cv <- function(params = list() } # Create folds - folds <- generate.cv.folds( + folds <- .generate_cv_folds( nfold = nfold , nrows = nrow(data) , stratified = stratified @@ -245,12 +247,12 @@ lgb.cv <- function(params = list() # Add printing log callback if (params[["verbosity"]] > 0L && eval_freq > 0L) { - callbacks <- add.cb(cb_list = callbacks, cb = cb_print_evaluation(period = eval_freq)) + callbacks <- .add_cb(cb_list = callbacks, cb = cb_print_evaluation(period = eval_freq)) } # Add evaluation log callback if (record) { - callbacks <- add.cb(cb_list = callbacks, cb = cb_record_evaluation()) + callbacks <- .add_cb(cb_list = callbacks, cb = cb_record_evaluation()) } # Did user pass parameters that indicate they want to use early stopping? @@ -282,7 +284,7 @@ lgb.cv <- function(params = list() # If user supplied early_stopping_rounds, add the early stopping callback if (using_early_stopping) { - callbacks <- add.cb( + callbacks <- .add_cb( cb_list = callbacks , cb = cb_early_stop( stopping_rounds = early_stopping_rounds @@ -292,7 +294,7 @@ lgb.cv <- function(params = list() ) } - cb <- categorize.callbacks(cb_list = callbacks) + cb <- .categorize_callbacks(cb_list = callbacks) # Construct booster for each fold. The data.table() code below is used to # guarantee that indices are sorted while keeping init_score and weight together @@ -387,7 +389,7 @@ lgb.cv <- function(params = list() }) # Prepare collection of evaluation results - merged_msg <- lgb.merge.cv.result( + merged_msg <- .merge_cv_result( msg = msg , showsd = showsd ) @@ -463,7 +465,7 @@ lgb.cv <- function(params = list() } # Generates random (stratified if needed) CV folds -generate.cv.folds <- function(nfold, nrows, stratified, label, group, params) { +.generate_cv_folds <- function(nfold, nrows, stratified, label, group, params) { # Check for group existence if (is.null(group)) { @@ -476,7 +478,7 @@ generate.cv.folds <- function(nfold, nrows, stratified, label, group, params) { y <- label[rnd_idx] y <- as.factor(y) - folds <- lgb.stratified.folds(y = y, k = nfold) + folds <- .stratified_folds(y = y, k = nfold) } else { @@ -528,7 +530,7 @@ generate.cv.folds <- function(nfold, nrows, stratified, label, group, params) { # It was borrowed from caret::createFolds and simplified # by always returning an unnamed list of fold indices. #' @importFrom stats quantile -lgb.stratified.folds <- function(y, k) { +.stratified_folds <- function(y, k) { # Group the numeric data based on their magnitudes # and sample within those groups. 
@@ -594,7 +596,7 @@ lgb.stratified.folds <- function(y, k) { return(out) } -lgb.merge.cv.result <- function(msg, showsd) { +.merge_cv_result <- function(msg, showsd) { if (length(msg) == 0L) { stop("lgb.cv: size of cv result error") diff --git a/R-package/R/lgb.drop_serialized.R b/R-package/R/lgb.drop_serialized.R index bcc2480e8ccc..e53f2cafac11 100644 --- a/R-package/R/lgb.drop_serialized.R +++ b/R-package/R/lgb.drop_serialized.R @@ -13,7 +13,7 @@ #' @seealso \link{lgb.restore_handle}, \link{lgb.make_serializable}. #' @export lgb.drop_serialized <- function(model) { - if (!lgb.is.Booster(x = model)) { + if (!.is_Booster(x = model)) { stop("lgb.drop_serialized: model should be an ", sQuote("lgb.Booster")) } model$drop_raw() diff --git a/R-package/R/lgb.importance.R b/R-package/R/lgb.importance.R index 5a58770553f9..7c76131f4f53 100644 --- a/R-package/R/lgb.importance.R +++ b/R-package/R/lgb.importance.R @@ -14,6 +14,8 @@ #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -39,7 +41,7 @@ #' @export lgb.importance <- function(model, percentage = TRUE) { - if (!lgb.is.Booster(x = model)) { + if (!.is_Booster(x = model)) { stop("'model' has to be an object of class lgb.Booster") } diff --git a/R-package/R/lgb.interprete.R b/R-package/R/lgb.interprete.R index 7de772664d8b..8f93d45429f1 100644 --- a/R-package/R/lgb.interprete.R +++ b/R-package/R/lgb.interprete.R @@ -17,6 +17,8 @@ #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' Logit <- function(x) log(x / (1.0 - x)) #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train @@ -86,7 +88,7 @@ lgb.interprete <- function(model, ) for (i in seq_along(idxset)) { - tree_interpretation_dt_list[[i]] <- single.row.interprete( + tree_interpretation_dt_list[[i]] <- .single_row_interprete( tree_dt = tree_dt , num_class = num_class , tree_index_mat = tree_index_mat_list[[i]] @@ -151,7 +153,7 @@ single.tree.interprete <- function(tree_dt, } #' @importFrom data.table := rbindlist setorder -multiple.tree.interprete <- function(tree_dt, +.multiple_tree_interprete <- function(tree_dt, tree_index, leaf_index) { @@ -186,7 +188,7 @@ multiple.tree.interprete <- function(tree_dt, } #' @importFrom data.table set setnames -single.row.interprete <- function(tree_dt, num_class, tree_index_mat, leaf_index_mat) { +.single_row_interprete <- function(tree_dt, num_class, tree_index_mat, leaf_index_mat) { # Prepare vector list tree_interpretation <- vector(mode = "list", length = num_class) @@ -194,7 +196,7 @@ single.row.interprete <- function(tree_dt, num_class, tree_index_mat, leaf_index # Loop throughout each class for (i in seq_len(num_class)) { - next_interp_dt <- multiple.tree.interprete( + next_interp_dt <- .multiple_tree_interprete( tree_dt = tree_dt , tree_index = tree_index_mat[, i] , leaf_index = leaf_index_mat[, i] diff --git a/R-package/R/lgb.make_serializable.R b/R-package/R/lgb.make_serializable.R index 58bdd194df4d..5a639aacb2b5 100644 --- a/R-package/R/lgb.make_serializable.R +++ b/R-package/R/lgb.make_serializable.R @@ -13,7 +13,7 @@ #' @seealso \link{lgb.restore_handle}, \link{lgb.drop_serialized}. 
#' @export lgb.make_serializable <- function(model) { - if (!lgb.is.Booster(x = model)) { + if (!.is_Booster(x = model)) { stop("lgb.make_serializable: model should be an ", sQuote("lgb.Booster")) } model$save_raw() diff --git a/R-package/R/lgb.model.dt.tree.R b/R-package/R/lgb.model.dt.tree.R index 8b0d8d81e2e8..bf4562e41018 100644 --- a/R-package/R/lgb.model.dt.tree.R +++ b/R-package/R/lgb.model.dt.tree.R @@ -29,6 +29,8 @@ #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -62,7 +64,10 @@ lgb.model.dt.tree <- function(model, num_iteration = NULL) { ) # Parse tree model - tree_list <- lapply(parsed_json_model$tree_info, single.tree.parse) + tree_list <- lapply( + X = parsed_json_model$tree_info + , FUN = .single_tree_parse + ) # Combine into single data.table tree_dt <- data.table::rbindlist(l = tree_list, use.names = TRUE) @@ -84,7 +89,7 @@ lgb.model.dt.tree <- function(model, num_iteration = NULL) { #' @importFrom data.table := data.table rbindlist -single.tree.parse <- function(lgb_tree) { +.single_tree_parse <- function(lgb_tree) { # Traverse tree function pre_order_traversal <- function(env = NULL, tree_node_leaf, current_depth = 0L, parent_index = NA_integer_) { diff --git a/R-package/R/lgb.plot.importance.R b/R-package/R/lgb.plot.importance.R index fc59ebd0efec..b8a90ca158ae 100644 --- a/R-package/R/lgb.plot.importance.R +++ b/R-package/R/lgb.plot.importance.R @@ -19,6 +19,8 @@ #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/R/lgb.plot.interpretation.R b/R-package/R/lgb.plot.interpretation.R index a88f14bf83f0..97650f30a7d3 100644 --- a/R-package/R/lgb.plot.interpretation.R +++ b/R-package/R/lgb.plot.interpretation.R @@ -16,6 +16,8 @@ #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' Logit <- function(x) { #' log(x / (1.0 - x)) #' } @@ -89,7 +91,7 @@ lgb.plot.interpretation <- function(tree_interpretation_dt, if (num_class == 1L) { # Only one class, plot straight away - multiple.tree.plot.interpretation( + .multiple_tree_plot_interpretation( tree_interpretation = tree_interpretation_dt , top_n = top_n , title = NULL @@ -118,7 +120,7 @@ lgb.plot.interpretation <- function(tree_interpretation_dt, , old = names(plot_dt) , new = c("Feature", "Contribution") ) - multiple.tree.plot.interpretation( + .multiple_tree_plot_interpretation( tree_interpretation = plot_dt , top_n = top_n , title = paste("Class", i - 1L) @@ -131,7 +133,7 @@ lgb.plot.interpretation <- function(tree_interpretation_dt, } #' @importFrom graphics barplot -multiple.tree.plot.interpretation <- function(tree_interpretation, +.multiple_tree_plot_interpretation <- function(tree_interpretation, top_n, title, cex) { diff --git a/R-package/R/lgb.restore_handle.R b/R-package/R/lgb.restore_handle.R index 4de93d46c96a..8a24cc628ca9 100644 --- a/R-package/R/lgb.restore_handle.R +++ b/R-package/R/lgb.restore_handle.R @@ -16,7 +16,10 @@ #' @return \code{lgb.Booster} (the same `model` object that was passed as input, invisibly). #' @seealso \link{lgb.make_serializable}, \link{lgb.drop_serialized}. 
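Together with lgb.drop_serialized() above, these helpers form a complete save/load round trip built on base R's saveRDS()/readRDS(). A short usage sketch, assuming model is an existing lgb.Booster (the temp file is illustrative):

lgb.make_serializable(model)      # attach raw model bytes before serializing
model_file <- tempfile(fileext = ".rds")
saveRDS(model, model_file)
model_new <- readRDS(model_file)
lgb.restore_handle(model_new)     # rebuild the C++ handle after loading
lgb.drop_serialized(model_new)    # optionally free the raw copy again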
#' @examples +#' \donttest{ #' library(lightgbm) +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data("agaricus.train") #' model <- lightgbm( #' agaricus.train$data @@ -33,9 +36,10 @@ #' model_new$check_null_handle() #' lgb.restore_handle(model_new) #' model_new$check_null_handle() +#' } #' @export lgb.restore_handle <- function(model) { - if (!lgb.is.Booster(x = model)) { + if (!.is_Booster(x = model)) { stop("lgb.restore_handle: model should be an ", sQuote("lgb.Booster")) } model$restore_handle() diff --git a/R-package/R/lgb.train.R b/R-package/R/lgb.train.R index 20916c9844b5..8a299fb6b8ac 100644 --- a/R-package/R/lgb.train.R +++ b/R-package/R/lgb.train.R @@ -19,6 +19,8 @@ #' #' @examples #' \donttest{ +#' \dontshow{setLGBMthreads(2L)} +#' \dontshow{data.table::setDTthreads(1L)} #' data(agaricus.train, package = "lightgbm") #' train <- agaricus.train #' dtrain <- lgb.Dataset(train$data, label = train$label) @@ -63,11 +65,11 @@ lgb.train <- function(params = list(), if (nrounds <= 0L) { stop("nrounds should be greater than zero") } - if (!lgb.is.Dataset(x = data)) { + if (!.is_Dataset(x = data)) { stop("lgb.train: data must be an lgb.Dataset instance") } if (length(valids) > 0L) { - if (!identical(class(valids), "list") || !all(vapply(valids, lgb.is.Dataset, logical(1L)))) { + if (!identical(class(valids), "list") || !all(vapply(valids, .is_Dataset, logical(1L)))) { stop("lgb.train: valids must be a list of lgb.Dataset elements") } evnames <- names(valids) @@ -80,27 +82,27 @@ lgb.train <- function(params = list(), # in `params`. # this ensures that the model stored with Booster$save() correctly represents # what was passed in - params <- lgb.check.wrapper_param( + params <- .check_wrapper_param( main_param_name = "verbosity" , params = params , alternative_kwarg_value = verbose ) - params <- lgb.check.wrapper_param( + params <- .check_wrapper_param( main_param_name = "num_iterations" , params = params , alternative_kwarg_value = nrounds ) - params <- lgb.check.wrapper_param( + params <- .check_wrapper_param( main_param_name = "metric" , params = params , alternative_kwarg_value = NULL ) - params <- lgb.check.wrapper_param( + params <- .check_wrapper_param( main_param_name = "objective" , params = params , alternative_kwarg_value = obj ) - params <- lgb.check.wrapper_param( + params <- .check_wrapper_param( main_param_name = "early_stopping_round" , params = params , alternative_kwarg_value = early_stopping_rounds @@ -118,7 +120,7 @@ lgb.train <- function(params = list(), # (for backwards compatibility). If it is a list of functions, store # all of them. 
This makes it possible to pass any mix of strings like "auc" # and custom functions to eval - params <- lgb.check.eval(params = params, eval = eval) + params <- .check_eval(params = params, eval = eval) eval_functions <- list(NULL) if (is.function(eval)) { eval_functions <- list(eval) @@ -136,7 +138,7 @@ lgb.train <- function(params = list(), # Check for boosting from a trained model if (is.character(init_model)) { predictor <- Predictor$new(modelfile = init_model) - } else if (lgb.is.Booster(x = init_model)) { + } else if (.is_Booster(x = init_model)) { predictor <- init_model$to_predictor() } @@ -166,7 +168,7 @@ lgb.train <- function(params = list(), } else if (!is.null(data$get_colnames())) { cnames <- data$get_colnames() } - params[["interaction_constraints"]] <- lgb.check_interaction_constraints( + params[["interaction_constraints"]] <- .check_interaction_constraints( interaction_constraints = interaction_constraints , column_names = cnames ) @@ -212,12 +214,18 @@ lgb.train <- function(params = list(), # Add printing log callback if (params[["verbosity"]] > 0L && eval_freq > 0L) { - callbacks <- add.cb(cb_list = callbacks, cb = cb_print_evaluation(period = eval_freq)) + callbacks <- .add_cb( + cb_list = callbacks + , cb = cb_print_evaluation(period = eval_freq) + ) } # Add evaluation log callback if (record && length(valids) > 0L) { - callbacks <- add.cb(cb_list = callbacks, cb = cb_record_evaluation()) + callbacks <- .add_cb( + cb_list = callbacks + , cb = cb_record_evaluation() + ) } # Did user pass parameters that indicate they want to use early stopping? @@ -249,7 +257,7 @@ lgb.train <- function(params = list(), # If user supplied early_stopping_rounds, add the early stopping callback if (using_early_stopping) { - callbacks <- add.cb( + callbacks <- .add_cb( cb_list = callbacks , cb = cb_early_stop( stopping_rounds = early_stopping_rounds @@ -259,7 +267,7 @@ lgb.train <- function(params = list(), ) } - cb <- categorize.callbacks(cb_list = callbacks) + cb <- .categorize_callbacks(cb_list = callbacks) # Construct booster with datasets booster <- Booster$new(params = params, train_set = data) diff --git a/R-package/R/lightgbm.R b/R-package/R/lightgbm.R index 711b3ef0dc38..e5df7a93fc97 100644 --- a/R-package/R/lightgbm.R +++ b/R-package/R/lightgbm.R @@ -184,21 +184,21 @@ lightgbm <- function(data, } if (is.null(num_threads)) { - num_threads <- lgb.get.default.num.threads() + num_threads <- .get_default_num_threads() } - params <- lgb.check.wrapper_param( + params <- .check_wrapper_param( main_param_name = "num_threads" , params = params , alternative_kwarg_value = num_threads ) - params <- lgb.check.wrapper_param( + params <- .check_wrapper_param( main_param_name = "verbosity" , params = params , alternative_kwarg_value = verbose ) # Process factors as labels and auto-determine objective - if (!lgb.is.Dataset(data)) { + if (!.is_Dataset(data)) { data_processor <- DataProcessor$new() temp <- data_processor$process_label( label = label @@ -220,7 +220,7 @@ lightgbm <- function(data, dtrain <- data # Check whether data is lgb.Dataset, if not then create lgb.Dataset manually - if (!lgb.is.Dataset(x = dtrain)) { + if (!.is_Dataset(x = dtrain)) { dtrain <- lgb.Dataset(data = data, label = label, weight = weights, init_score = init_score) } diff --git a/R-package/R/multithreading.R b/R-package/R/multithreading.R new file mode 100644 index 000000000000..a8d6b51a8968 --- /dev/null +++ b/R-package/R/multithreading.R @@ -0,0 +1,51 @@ +#' @name setLGBMThreads +#' @title Set maximum number of 
threads used by LightGBM +#' @description LightGBM attempts to speed up many operations by using multi-threading. +#' The number of threads used in those operations can be controlled via the +#' \code{num_threads} parameter passed through \code{params} to functions like +#' \link{lgb.train} and \link{lgb.Dataset}. However, some operations (like materializing +#' a model from a text file) are done via code paths that don't explicitly accept thread-control +#' configuration. +#' +#' Use this function to set the maximum number of threads LightGBM will use for such operations. +#' +#' This function affects all LightGBM operations in the same process. +#' +#' So, for example, if you call \code{setLGBMthreads(4)}, no other multi-threaded LightGBM +#' operation in the same process will use more than 4 threads. +#' +#' Call \code{setLGBMthreads(-1)} to remove this limitation. +#' @param num_threads maximum number of threads to be used by LightGBM in multi-threaded operations +#' @return NULL +#' @seealso \link{getLGBMthreads} +#' @export +setLGBMthreads <- function(num_threads) { + .Call( + LGBM_SetMaxThreads_R, + num_threads + ) + return(invisible(NULL)) +} + +#' @name getLGBMThreads +#' @title Get default number of threads used by LightGBM +#' @description LightGBM attempts to speed up many operations by using multi-threading. +#' The number of threads used in those operations can be controlled via the +#' \code{num_threads} parameter passed through \code{params} to functions like +#' \link{lgb.train} and \link{lgb.Dataset}. However, some operations (like materializing +#' a model from a text file) are done via code paths that don't explicitly accept thread-control +#' configuration. +#' +#' Use this function to see the default number of threads LightGBM will use for such operations. +#' @return number of threads as an integer. \code{-1} means that in situations where parameter \code{num_threads} is +#' not explicitly supplied, LightGBM will choose a number of threads to use automatically. +#' @seealso \link{setLGBMthreads} +#' @export +getLGBMthreads <- function() { + out <- 0L + .Call( + LGBM_GetMaxThreads_R, + out + ) + return(out) +} diff --git a/R-package/R/readRDS.lgb.Booster.R b/R-package/R/readRDS.lgb.Booster.R deleted file mode 100644 index a8abac642c24..000000000000 --- a/R-package/R/readRDS.lgb.Booster.R +++ /dev/null @@ -1,48 +0,0 @@ -#' @name readRDS.lgb.Booster -#' @title readRDS for \code{lgb.Booster} models (DEPRECATED) -#' @description Calls \code{readRDS} in what is expected to be a serialized \code{lgb.Booster} object, -#' and then restores its handle through \code{lgb.restore_handle}. -#' -#' \bold{This function throws a warning and will be removed in future versions.} -#' @param file a connection or the name of the file where the R object is saved to or read from. -#' @param refhook a hook function for handling reference objects. 
-#' -#' @return \code{lgb.Booster} -#' -#' @examples -#' \donttest{ -#' library(lightgbm) -#' data(agaricus.train, package = "lightgbm") -#' train <- agaricus.train -#' dtrain <- lgb.Dataset(train$data, label = train$label) -#' data(agaricus.test, package = "lightgbm") -#' test <- agaricus.test -#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) -#' params <- list( -#' objective = "regression" -#' , metric = "l2" -#' , min_data = 1L -#' , learning_rate = 1.0 -#' , num_threads = 2L -#' ) -#' valids <- list(test = dtest) -#' model <- lgb.train( -#' params = params -#' , data = dtrain -#' , nrounds = 10L -#' , valids = valids -#' , early_stopping_rounds = 5L -#' ) -#' model_file <- tempfile(fileext = ".rds") -#' saveRDS.lgb.Booster(model, model_file) -#' new_model <- readRDS.lgb.Booster(model_file) -#' } -#' @export -readRDS.lgb.Booster <- function(file, refhook = NULL) { - - warning("'readRDS.lgb.Booster' is deprecated and will be removed in a future release. Use readRDS() instead.") - - object <- readRDS(file = file, refhook = refhook) - lgb.restore_handle(object) - return(object) -} diff --git a/R-package/R/saveRDS.lgb.Booster.R b/R-package/R/saveRDS.lgb.Booster.R deleted file mode 100644 index 5d3af097301f..000000000000 --- a/R-package/R/saveRDS.lgb.Booster.R +++ /dev/null @@ -1,78 +0,0 @@ -#' @name saveRDS.lgb.Booster -#' @title saveRDS for \code{lgb.Booster} models (DEPRECATED) -#' @description Calls \code{saveRDS} on an \code{lgb.Booster} object, making it serializable before the call if -#' it isn't already. -#' -#' \bold{This function throws a warning and will be removed in future versions.} -#' @param object \code{lgb.Booster} object to serialize. -#' @param file a connection or the name of the file where the R object is saved to or read from. -#' @param ascii a logical. If TRUE or NA, an ASCII representation is written; otherwise (default), -#' a binary one is used. See the comments in the help for save. -#' @param version the workspace format version to use. \code{NULL} specifies the current default -#' version (2). Versions prior to 2 are not supported, so this will only be relevant -#' when there are later versions. -#' @param compress a logical specifying whether saving to a named file is to use "gzip" compression, -#' or one of \code{"gzip"}, \code{"bzip2"} or \code{"xz"} to indicate the type of -#' compression to be used. Ignored if file is a connection. -#' @param refhook a hook function for handling reference objects. -#' @param raw whether to save the model in a raw variable or not, recommended to leave it to \code{TRUE}. -#' -#' @return NULL invisibly. 
-#' -#' @examples -#' \donttest{ -#' library(lightgbm) -#' data(agaricus.train, package = "lightgbm") -#' train <- agaricus.train -#' dtrain <- lgb.Dataset(train$data, label = train$label) -#' data(agaricus.test, package = "lightgbm") -#' test <- agaricus.test -#' dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) -#' params <- list( -#' objective = "regression" -#' , metric = "l2" -#' , min_data = 1L -#' , learning_rate = 1.0 -#' , num_threads = 2L -#' ) -#' valids <- list(test = dtest) -#' model <- lgb.train( -#' params = params -#' , data = dtrain -#' , nrounds = 10L -#' , valids = valids -#' , early_stopping_rounds = 5L -#' ) -#' model_file <- tempfile(fileext = ".rds") -#' saveRDS.lgb.Booster(model, model_file) -#' } -#' @export -saveRDS.lgb.Booster <- function(object, - file, - ascii = FALSE, - version = NULL, - compress = TRUE, - refhook = NULL, - raw = TRUE) { - - warning("'saveRDS.lgb.Booster' is deprecated and will be removed in a future release. Use saveRDS() instead.") - - if (!lgb.is.Booster(x = object)) { - stop("saveRDS.lgb.Booster: object should be an ", sQuote("lgb.Booster")) - } - - if (is.null(object$raw)) { - lgb.make_serializable(object) - } - - saveRDS( - object - , file = file - , ascii = ascii - , version = version - , compress = compress - , refhook = refhook - ) - - return(invisible(NULL)) -} diff --git a/R-package/R/utils.R b/R-package/R/utils.R index c9ba780316df..1ac6f197ca77 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -1,16 +1,16 @@ -lgb.is.Booster <- function(x) { +.is_Booster <- function(x) { return(all(c("R6", "lgb.Booster") %in% class(x))) # nolint: class_equals } -lgb.is.Dataset <- function(x) { +.is_Dataset <- function(x) { return(all(c("R6", "lgb.Dataset") %in% class(x))) # nolint: class_equals } -lgb.is.Predictor <- function(x) { +.is_Predictor <- function(x) { return(all(c("R6", "lgb.Predictor") %in% class(x))) # nolint: class_equals } -lgb.is.null.handle <- function(x) { +.is_null_handle <- function(x) { if (is.null(x)) { return(TRUE) } @@ -19,7 +19,7 @@ lgb.is.null.handle <- function(x) { ) } -lgb.params2str <- function(params) { +.params2str <- function(params) { if (!identical(class(params), "list")) { stop("params must be a list") @@ -59,7 +59,7 @@ lgb.params2str <- function(params) { } -lgb.check_interaction_constraints <- function(interaction_constraints, column_names) { +.check_interaction_constraints <- function(interaction_constraints, column_names) { # Convert interaction constraints to feature numbers string_constraints <- list() @@ -129,7 +129,7 @@ lgb.check_interaction_constraints <- function(interaction_constraints, column_na # This has to account for the fact that `eval` could be a character vector, # a function, a list of functions, or a list with a mix of strings and # functions -lgb.check.eval <- function(params, eval) { +.check_eval <- function(params, eval) { if (is.null(params$metric)) { params$metric <- list() @@ -194,7 +194,7 @@ lgb.check.eval <- function(params, eval) { # [return] # params with num_iterations set to the chosen value, and other aliases # of num_iterations removed -lgb.check.wrapper_param <- function(main_param_name, params, alternative_kwarg_value) { +.check_wrapper_param <- function(main_param_name, params, alternative_kwarg_value) { aliases <- .PARAMETER_ALIASES()[[main_param_name]] aliases_provided <- aliases[aliases %in% names(params)] @@ -225,7 +225,7 @@ lgb.check.wrapper_param <- function(main_param_name, params, alternative_kwarg_v } #' @importFrom parallel 
detectCores -lgb.get.default.num.threads <- function() { +.get_default_num_threads <- function() { if (requireNamespace("RhpcBLASctl", quietly = TRUE)) { # nolint: undesirable_function return(RhpcBLASctl::get_num_cores()) } else { @@ -247,7 +247,7 @@ lgb.get.default.num.threads <- function() { } } -lgb.equal.or.both.null <- function(a, b) { +.equal_or_both_null <- function(a, b) { if (is.null(a)) { if (!is.null(b)) { return(FALSE) diff --git a/R-package/README.md b/R-package/README.md index 69f6dba67053..a9dc65b5d7b3 100644 --- a/R-package/README.md +++ b/R-package/README.md @@ -97,9 +97,10 @@ After installing `Rtools` and `CMake`, be sure the following paths are added to - If you have `Rtools` 4.0, example: - `C:\rtools40\mingw64\bin` - `C:\rtools40\usr\bin` - - If you have `Rtools` 4.2, example: + - If you have `Rtools` 4.2+, example: - `C:\rtools42\x86_64-w64-mingw32.static.posix\bin` - `C:\rtools42\usr\bin` + - **NOTE**: this is e.g. `rtools43\` for R 4.3 * `CMake` - example: `C:\Program Files\CMake\bin` * `R` @@ -107,7 +108,7 @@ After installing `Rtools` and `CMake`, be sure the following paths are added to NOTE: Two `Rtools` paths are required from `Rtools` 4.0 onwards because paths and the list of included software was changed in `Rtools` 4.0. -NOTE: `Rtools42` takes a very different approach to the compiler toolchain than previous releases, and how you install it changes what is required to build packages. See ["Howto: Building R 4.2 and packages on Windows"](https://cran.r-project.org/bin/windows/base/howto-R-4.2.html). +NOTE: `Rtools42` and later take a very different approach to the compiler toolchain than previous releases, and how you install it changes what is required to build packages. See ["Howto: Building R 4.2 and packages on Windows"](https://cran.r-project.org/bin/windows/base/howto-R-4.2.html). #### Windows Toolchain Options @@ -427,7 +428,7 @@ docker run \ # install dependencies RDscript${R_CUSTOMIZATION} \ - -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'RhpcBLASctl', 'rmarkdown', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" + -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'markdown', 'Matrix', 'RhpcBLASctl', 'testthat'), repos = 'https://cran.r-project.org', Ncpus = parallel::detectCores())" # install lightgbm sh build-cran-package.sh --r-executable=RD${R_CUSTOMIZATION} @@ -458,7 +459,7 @@ docker run \ -it \ wch1/r-debug -RDscriptvalgrind -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'Matrix', 'RhpcBLASctl', 'rmarkdown', 'testthat'), repos = 'https://cran.rstudio.com', Ncpus = parallel::detectCores())" +RDscriptvalgrind -e "install.packages(c('R6', 'data.table', 'jsonlite', 'knitr', 'markdown', 'Matrix', 'RhpcBLASctl', 'testthat'), repos = 'https://cran.rstudio.com', Ncpus = parallel::detectCores())" sh build-cran-package.sh \ --r-executable=RDvalgrind diff --git a/R-package/configure b/R-package/configure index 39a18d669833..37dfdbfbf6c7 100755 --- a/R-package/configure +++ b/R-package/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.71 for lightgbm 4.1.0.99. +# Generated by GNU Autoconf 2.71 for lightgbm 4.2.0.99. # # # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation, @@ -607,8 +607,8 @@ MAKEFLAGS= # Identity of this package. 
PACKAGE_NAME='lightgbm' PACKAGE_TARNAME='lightgbm' -PACKAGE_VERSION='4.1.0.99' -PACKAGE_STRING='lightgbm 4.1.0.99' +PACKAGE_VERSION='4.2.0.99' +PACKAGE_STRING='lightgbm 4.2.0.99' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1211,7 +1211,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures lightgbm 4.1.0.99 to adapt to many kinds of systems. +\`configure' configures lightgbm 4.2.0.99 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1273,7 +1273,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of lightgbm 4.1.0.99:";; + short | recursive ) echo "Configuration of lightgbm 4.2.0.99:";; esac cat <<\_ACEOF @@ -1341,7 +1341,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -lightgbm configure 4.1.0.99 +lightgbm configure 4.2.0.99 generated by GNU Autoconf 2.71 Copyright (C) 2021 Free Software Foundation, Inc. @@ -1378,7 +1378,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by lightgbm $as_me 4.1.0.99, which was +It was created by lightgbm $as_me 4.2.0.99, which was generated by GNU Autoconf 2.71. Invocation command line was $ $0$ac_configure_args_raw @@ -2454,7 +2454,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by lightgbm $as_me 4.1.0.99, which was +This file was extended by lightgbm $as_me 4.2.0.99, which was generated by GNU Autoconf 2.71. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -2509,7 +2509,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config='$ac_cs_config_escaped' ac_cs_version="\\ -lightgbm config.status 4.1.0.99 +lightgbm config.status 4.2.0.99 configured by $0, generated by GNU Autoconf 2.71, with options \\"\$ac_cs_config\\" diff --git a/R-package/cran-comments.md b/R-package/cran-comments.md index 44b8ed391bfc..80f84b924a48 100644 --- a/R-package/cran-comments.md +++ b/R-package/cran-comments.md @@ -1,5 +1,16 @@ # CRAN Submission History +## v4.2.0 - Submission 1 - (December 7, 2023) + +### CRAN response + +Accepted to CRAN + +### Maintainer Notes + +This submission included many changes from the last 2 years, as well as fixes for a warning +CRAN said could cause the package to be archived: https://github.com/microsoft/LightGBM/issues/6221. + ## v4.1.0 - not submitted v4.1.0 was not submitted to CRAN, because https://github.com/microsoft/LightGBM/issues/5987 had not been resolved. 
diff --git a/R-package/demo/basic_walkthrough.R b/R-package/demo/basic_walkthrough.R index b5bfed26d935..c9ac484e68cb 100644 --- a/R-package/demo/basic_walkthrough.R +++ b/R-package/demo/basic_walkthrough.R @@ -1,5 +1,4 @@ library(lightgbm) -library(methods) # We load in the agaricus dataset # In this example, we are aiming to predict whether a mushroom is edible diff --git a/R-package/demo/boost_from_prediction.R b/R-package/demo/boost_from_prediction.R index 5ee04f2d756e..68d33a73cac0 100644 --- a/R-package/demo/boost_from_prediction.R +++ b/R-package/demo/boost_from_prediction.R @@ -1,5 +1,4 @@ library(lightgbm) -library(methods) # Load in the agaricus dataset data(agaricus.train, package = "lightgbm") diff --git a/R-package/demo/early_stopping.R b/R-package/demo/early_stopping.R index 5179195f02ad..6ca214c5ac7b 100644 --- a/R-package/demo/early_stopping.R +++ b/R-package/demo/early_stopping.R @@ -1,5 +1,4 @@ library(lightgbm) -library(methods) # Load in the agaricus dataset data(agaricus.train, package = "lightgbm") diff --git a/R-package/man/dim.Rd b/R-package/man/dim.Rd index 94ca192d8291..69332d0ec397 100644 --- a/R-package/man/dim.Rd +++ b/R-package/man/dim.Rd @@ -21,6 +21,8 @@ be directly used with an \code{lgb.Dataset} object. } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/dimnames.lgb.Dataset.Rd b/R-package/man/dimnames.lgb.Dataset.Rd index ec01a04f607b..85f2085f1d77 100644 --- a/R-package/man/dimnames.lgb.Dataset.Rd +++ b/R-package/man/dimnames.lgb.Dataset.Rd @@ -28,6 +28,8 @@ Since row names are irrelevant, it is recommended to use \code{colnames} directl } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/getLGBMThreads.Rd b/R-package/man/getLGBMThreads.Rd new file mode 100644 index 000000000000..21af4f4849d4 --- /dev/null +++ b/R-package/man/getLGBMThreads.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/multithreading.R +\name{getLGBMThreads} +\alias{getLGBMThreads} +\alias{getLGBMthreads} +\title{Get default number of threads used by LightGBM} +\usage{ +getLGBMthreads() +} +\value{ +number of threads as an integer. \code{-1} means that in situations where parameter \code{num_threads} is + not explicitly supplied, LightGBM will choose a number of threads to use automatically. +} +\description{ +LightGBM attempts to speed up many operations by using multi-threading. + The number of threads used in those operations can be controlled via the + \code{num_threads} parameter passed through \code{params} to functions like + \link{lgb.train} and \link{lgb.Dataset}. However, some operations (like materializing + a model from a text file) are done via code paths that don't explicitly accept thread-control + configuration. + + Use this function to see the default number of threads LightGBM will use for such operations. 
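A quick sketch of how the two new helpers are used together (the values shown in comments are illustrative):

library(lightgbm)
getLGBMthreads()     # -1 by default: LightGBM chooses thread counts itself
setLGBMthreads(2L)   # cap all multi-threaded LightGBM operations in this process
getLGBMthreads()     # now 2
setLGBMthreads(-1L)  # lift the cap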
+} +\seealso{ +\link{setLGBMthreads} +} diff --git a/R-package/man/get_field.Rd b/R-package/man/get_field.Rd index 1b6692fcf807..e2562cc21364 100644 --- a/R-package/man/get_field.Rd +++ b/R-package/man/get_field.Rd @@ -32,6 +32,8 @@ Get one attribute of a \code{lgb.Dataset} } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.Dataset.Rd b/R-package/man/lgb.Dataset.Rd index 4895600ff922..2605657b060a 100644 --- a/R-package/man/lgb.Dataset.Rd +++ b/R-package/man/lgb.Dataset.Rd @@ -65,6 +65,8 @@ Construct \code{lgb.Dataset} object from dense matrix, sparse matrix } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.Dataset.construct.Rd b/R-package/man/lgb.Dataset.construct.Rd index 97c9e7887602..e400e0a5f8d5 100644 --- a/R-package/man/lgb.Dataset.construct.Rd +++ b/R-package/man/lgb.Dataset.construct.Rd @@ -17,6 +17,8 @@ Construct Dataset explicitly } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.Dataset.create.valid.Rd b/R-package/man/lgb.Dataset.create.valid.Rd index ab8ca753c2b9..fc50dff19986 100644 --- a/R-package/man/lgb.Dataset.create.valid.Rd +++ b/R-package/man/lgb.Dataset.create.valid.Rd @@ -48,6 +48,8 @@ Construct validation data according to training data } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.Dataset.save.Rd b/R-package/man/lgb.Dataset.save.Rd index 5ea38227ba66..b03c2c5e0ac5 100644 --- a/R-package/man/lgb.Dataset.save.Rd +++ b/R-package/man/lgb.Dataset.save.Rd @@ -20,6 +20,8 @@ Please note that \code{init_score} is not saved in binary file. } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.Dataset.set.categorical.Rd b/R-package/man/lgb.Dataset.set.categorical.Rd index 26eb10770e47..5dfcc9a771e8 100644 --- a/R-package/man/lgb.Dataset.set.categorical.Rd +++ b/R-package/man/lgb.Dataset.set.categorical.Rd @@ -22,6 +22,8 @@ Set the categorical features of an \code{lgb.Dataset} object. 
Use this function } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.Dataset.set.reference.Rd b/R-package/man/lgb.Dataset.set.reference.Rd index 349b0b22913e..a4efbfac5962 100644 --- a/R-package/man/lgb.Dataset.set.reference.Rd +++ b/R-package/man/lgb.Dataset.set.reference.Rd @@ -19,6 +19,8 @@ If you want to use validation data, you should set reference to training data } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} # create training Dataset data(agaricus.train, package ="lightgbm") train <- agaricus.train diff --git a/R-package/man/lgb.configure_fast_predict.Rd b/R-package/man/lgb.configure_fast_predict.Rd index 39fe6afa6b18..e02600451df5 100644 --- a/R-package/man/lgb.configure_fast_predict.Rd +++ b/R-package/man/lgb.configure_fast_predict.Rd @@ -114,6 +114,8 @@ Calling this function multiple times with different parameters might not overrid } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} library(lightgbm) data(mtcars) X <- as.matrix(mtcars[, -1L]) diff --git a/R-package/man/lgb.cv.Rd b/R-package/man/lgb.cv.Rd index 555cb11c7bb3..7ea2928c6166 100644 --- a/R-package/man/lgb.cv.Rd +++ b/R-package/man/lgb.cv.Rd @@ -152,6 +152,8 @@ Cross validation logic used by LightGBM \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.dump.Rd b/R-package/man/lgb.dump.Rd index f4e90242fd75..39f0e3018ac7 100644 --- a/R-package/man/lgb.dump.Rd +++ b/R-package/man/lgb.dump.Rd @@ -20,6 +20,8 @@ Dump LightGBM model to json \examples{ \donttest{ library(lightgbm) +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.get.eval.result.Rd b/R-package/man/lgb.get.eval.result.Rd index 9c2293a0f909..0dc7eb0845c3 100644 --- a/R-package/man/lgb.get.eval.result.Rd +++ b/R-package/man/lgb.get.eval.result.Rd @@ -33,6 +33,8 @@ Given a \code{lgb.Booster}, return evaluation results for a } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} # train a regression model data(agaricus.train, package = "lightgbm") train <- agaricus.train diff --git a/R-package/man/lgb.importance.Rd b/R-package/man/lgb.importance.Rd index 89a3d4e6b5b7..79cb82f5d8ef 100644 --- a/R-package/man/lgb.importance.Rd +++ b/R-package/man/lgb.importance.Rd @@ -25,6 +25,8 @@ Creates a \code{data.table} of feature importances in a model. } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.interprete.Rd b/R-package/man/lgb.interprete.Rd index c1166b2c1cc9..3acc27955c46 100644 --- a/R-package/man/lgb.interprete.Rd +++ b/R-package/man/lgb.interprete.Rd @@ -30,6 +30,8 @@ Computes feature contribution components of rawscore prediction. 
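Every \examples block touched in these .Rd files gains the same two hidden setup lines. Code inside \dontshow{} is executed by example() and R CMD check but never displayed in rendered help, so the pattern bounds parallelism on CRAN's shared check machines without cluttering the documentation. The skeleton being applied throughout (example body elided):

\examples{
\donttest{
\dontshow{setLGBMthreads(2L)}
\dontshow{data.table::setDTthreads(1L)}
# ... example code, limited to 2 LightGBM threads and 1 data.table thread ...
}
}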
} \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} Logit <- function(x) log(x / (1.0 - x)) data(agaricus.train, package = "lightgbm") train <- agaricus.train diff --git a/R-package/man/lgb.load.Rd b/R-package/man/lgb.load.Rd index c1a00a20974b..f145db5a245e 100644 --- a/R-package/man/lgb.load.Rd +++ b/R-package/man/lgb.load.Rd @@ -20,6 +20,8 @@ Load LightGBM takes in either a file path or model string. } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.model.dt.tree.Rd b/R-package/man/lgb.model.dt.tree.Rd index 4d02ede9a001..60ef8cdac133 100644 --- a/R-package/man/lgb.model.dt.tree.Rd +++ b/R-package/man/lgb.model.dt.tree.Rd @@ -40,6 +40,8 @@ Parse a LightGBM model json dump into a \code{data.table} structure. } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.plot.importance.Rd b/R-package/man/lgb.plot.importance.Rd index 302f46460e3f..bdf354da0385 100644 --- a/R-package/man/lgb.plot.importance.Rd +++ b/R-package/man/lgb.plot.importance.Rd @@ -38,6 +38,8 @@ Features are shown ranked in a decreasing importance order. } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/lgb.plot.interpretation.Rd b/R-package/man/lgb.plot.interpretation.Rd index a914071e896f..6f168e120a4e 100644 --- a/R-package/man/lgb.plot.interpretation.Rd +++ b/R-package/man/lgb.plot.interpretation.Rd @@ -35,6 +35,8 @@ contribution of a feature. Features are shown ranked in a decreasing contributio } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} Logit <- function(x) { log(x / (1.0 - x)) } diff --git a/R-package/man/lgb.restore_handle.Rd b/R-package/man/lgb.restore_handle.Rd index 95cbdc64485d..37922c077642 100644 --- a/R-package/man/lgb.restore_handle.Rd +++ b/R-package/man/lgb.restore_handle.Rd @@ -27,7 +27,10 @@ function. If you wish to make fast single-row predictions using a \code{lgb.Boos call \link{lgb.configure_fast_predict} on the loaded \code{lgb.Booster} object. } \examples{ +\donttest{ library(lightgbm) +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data("agaricus.train") model <- lightgbm( agaricus.train$data @@ -45,6 +48,7 @@ model_new$check_null_handle() lgb.restore_handle(model_new) model_new$check_null_handle() } +} \seealso{ \link{lgb.make_serializable}, \link{lgb.drop_serialized}. } diff --git a/R-package/man/lgb.save.Rd b/R-package/man/lgb.save.Rd index efd110c7d816..62ec0ed462f6 100644 --- a/R-package/man/lgb.save.Rd +++ b/R-package/man/lgb.save.Rd @@ -21,6 +21,8 @@ Save LightGBM model } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} library(lightgbm) data(agaricus.train, package = "lightgbm") train <- agaricus.train diff --git a/R-package/man/lgb.train.Rd b/R-package/man/lgb.train.Rd index 0f2961edc415..557c85b7f9dc 100644 --- a/R-package/man/lgb.train.Rd +++ b/R-package/man/lgb.train.Rd @@ -130,6 +130,8 @@ Low-level R interface to train a LightGBM model. 
Unlike \code{\link{lightgbm}}, \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/predict.lgb.Booster.Rd b/R-package/man/predict.lgb.Booster.Rd index 2df13b9bc374..bcb2f3f980fb 100644 --- a/R-package/man/predict.lgb.Booster.Rd +++ b/R-package/man/predict.lgb.Booster.Rd @@ -121,6 +121,8 @@ If the model object has been configured for fast single-row predictions through } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/readRDS.lgb.Booster.Rd b/R-package/man/readRDS.lgb.Booster.Rd deleted file mode 100644 index 6a8e4c80ca91..000000000000 --- a/R-package/man/readRDS.lgb.Booster.Rd +++ /dev/null @@ -1,51 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/readRDS.lgb.Booster.R -\name{readRDS.lgb.Booster} -\alias{readRDS.lgb.Booster} -\title{readRDS for \code{lgb.Booster} models (DEPRECATED)} -\usage{ -readRDS.lgb.Booster(file, refhook = NULL) -} -\arguments{ -\item{file}{a connection or the name of the file where the R object is saved to or read from.} - -\item{refhook}{a hook function for handling reference objects.} -} -\value{ -\code{lgb.Booster} -} -\description{ -Calls \code{readRDS} in what is expected to be a serialized \code{lgb.Booster} object, - and then restores its handle through \code{lgb.restore_handle}. - - \bold{This function throws a warning and will be removed in future versions.} -} -\examples{ -\donttest{ -library(lightgbm) -data(agaricus.train, package = "lightgbm") -train <- agaricus.train -dtrain <- lgb.Dataset(train$data, label = train$label) -data(agaricus.test, package = "lightgbm") -test <- agaricus.test -dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) -params <- list( - objective = "regression" - , metric = "l2" - , min_data = 1L - , learning_rate = 1.0 - , num_threads = 2L -) -valids <- list(test = dtest) -model <- lgb.train( - params = params - , data = dtrain - , nrounds = 10L - , valids = valids - , early_stopping_rounds = 5L -) -model_file <- tempfile(fileext = ".rds") -saveRDS.lgb.Booster(model, model_file) -new_model <- readRDS.lgb.Booster(model_file) -} -} diff --git a/R-package/man/saveRDS.lgb.Booster.Rd b/R-package/man/saveRDS.lgb.Booster.Rd deleted file mode 100644 index a8664243dce2..000000000000 --- a/R-package/man/saveRDS.lgb.Booster.Rd +++ /dev/null @@ -1,73 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/saveRDS.lgb.Booster.R -\name{saveRDS.lgb.Booster} -\alias{saveRDS.lgb.Booster} -\title{saveRDS for \code{lgb.Booster} models (DEPRECATED)} -\usage{ -saveRDS.lgb.Booster( - object, - file, - ascii = FALSE, - version = NULL, - compress = TRUE, - refhook = NULL, - raw = TRUE -) -} -\arguments{ -\item{object}{\code{lgb.Booster} object to serialize.} - -\item{file}{a connection or the name of the file where the R object is saved to or read from.} - -\item{ascii}{a logical. If TRUE or NA, an ASCII representation is written; otherwise (default), -a binary one is used. See the comments in the help for save.} - -\item{version}{the workspace format version to use. \code{NULL} specifies the current default -version (2). 
Versions prior to 2 are not supported, so this will only be relevant -when there are later versions.} - -\item{compress}{a logical specifying whether saving to a named file is to use "gzip" compression, -or one of \code{"gzip"}, \code{"bzip2"} or \code{"xz"} to indicate the type of -compression to be used. Ignored if file is a connection.} - -\item{refhook}{a hook function for handling reference objects.} - -\item{raw}{whether to save the model in a raw variable or not, recommended to leave it to \code{TRUE}.} -} -\value{ -NULL invisibly. -} -\description{ -Calls \code{saveRDS} on an \code{lgb.Booster} object, making it serializable before the call if - it isn't already. - - \bold{This function throws a warning and will be removed in future versions.} -} -\examples{ -\donttest{ -library(lightgbm) -data(agaricus.train, package = "lightgbm") -train <- agaricus.train -dtrain <- lgb.Dataset(train$data, label = train$label) -data(agaricus.test, package = "lightgbm") -test <- agaricus.test -dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label) -params <- list( - objective = "regression" - , metric = "l2" - , min_data = 1L - , learning_rate = 1.0 - , num_threads = 2L -) -valids <- list(test = dtest) -model <- lgb.train( - params = params - , data = dtrain - , nrounds = 10L - , valids = valids - , early_stopping_rounds = 5L -) -model_file <- tempfile(fileext = ".rds") -saveRDS.lgb.Booster(model, model_file) -} -} diff --git a/R-package/man/setLGBMThreads.Rd b/R-package/man/setLGBMThreads.Rd new file mode 100644 index 000000000000..53336fc2548e --- /dev/null +++ b/R-package/man/setLGBMThreads.Rd @@ -0,0 +1,32 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/multithreading.R +\name{setLGBMThreads} +\alias{setLGBMThreads} +\alias{setLGBMthreads} +\title{Set maximum number of threads used by LightGBM} +\usage{ +setLGBMthreads(num_threads) +} +\arguments{ +\item{num_threads}{maximum number of threads to be used by LightGBM in multi-threaded operations} +} +\description{ +LightGBM attempts to speed up many operations by using multi-threading. + The number of threads used in those operations can be controlled via the + \code{num_threads} parameter passed through \code{params} to functions like + \link{lgb.train} and \link{lgb.Dataset}. However, some operations (like materializing + a model from a text file) are done via code paths that don't explicitly accept thread-control + configuration. + + Use this function to set the maximum number of threads LightGBM will use for such operations. + + This function affects all LightGBM operations in the same process. + + So, for example, if you call \code{setLGBMthreads(4)}, no other multi-threaded LightGBM + operation in the same process will use more than 4 threads. + + Call \code{setLGBMthreads(-1)} to remove this limitation. 
+} +\seealso{ +\link{getLGBMthreads} +} diff --git a/R-package/man/set_field.Rd b/R-package/man/set_field.Rd index f9901e27eefd..2ceebfb87753 100644 --- a/R-package/man/set_field.Rd +++ b/R-package/man/set_field.Rd @@ -34,6 +34,8 @@ Set one attribute of a \code{lgb.Dataset} } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/man/slice.Rd b/R-package/man/slice.Rd index 1d7bec08de0f..a65809a239d8 100644 --- a/R-package/man/slice.Rd +++ b/R-package/man/slice.Rd @@ -23,6 +23,8 @@ Get a new \code{lgb.Dataset} containing the specified rows of } \examples{ \donttest{ +\dontshow{setLGBMthreads(2L)} +\dontshow{data.table::setDTthreads(1L)} data(agaricus.train, package = "lightgbm") train <- agaricus.train dtrain <- lgb.Dataset(train$data, label = train$label) diff --git a/R-package/pkgdown/_pkgdown.yml b/R-package/pkgdown/_pkgdown.yml index 233a31f0ead9..99a3b1010d41 100644 --- a/R-package/pkgdown/_pkgdown.yml +++ b/R-package/pkgdown/_pkgdown.yml @@ -85,8 +85,6 @@ reference: - '`lgb.save`' - '`lgb.load`' - '`lgb.model.dt.tree`' - - '`saveRDS.lgb.Booster`' - - '`readRDS.lgb.Booster`' - title: Model Interpretation desc: Analyze your models contents: diff --git a/R-package/src/Makevars.in b/R-package/src/Makevars.in index ba9ef054bfab..c04263f62c1c 100644 --- a/R-package/src/Makevars.in +++ b/R-package/src/Makevars.in @@ -53,5 +53,6 @@ OBJECTS = \ treelearner/serial_tree_learner.o \ treelearner/tree_learner.o \ treelearner/voting_parallel_tree_learner.o \ + utils/openmp_wrapper.o \ c_api.o \ lightgbm_R.o diff --git a/R-package/src/Makevars.win.in b/R-package/src/Makevars.win.in index 14f5afde002f..86d56fecdf34 100644 --- a/R-package/src/Makevars.win.in +++ b/R-package/src/Makevars.win.in @@ -54,5 +54,6 @@ OBJECTS = \ treelearner/serial_tree_learner.o \ treelearner/tree_learner.o \ treelearner/voting_parallel_tree_learner.o \ + utils/openmp_wrapper.o \ c_api.o \ lightgbm_R.o diff --git a/R-package/src/lightgbm_R.cpp b/R-package/src/lightgbm_R.cpp index 82956daef4b9..a76a56c06b24 100644 --- a/R-package/src/lightgbm_R.cpp +++ b/R-package/src/lightgbm_R.cpp @@ -40,7 +40,7 @@ void LGBM_R_save_exception_msg(const std::string &err); catch(std::exception& ex) { LGBM_R_save_exception_msg(ex); } \ catch(std::string& ex) { LGBM_R_save_exception_msg(ex); } \ catch(...) 
{ Rf_error("unknown exception"); } \ - Rf_error(R_errmsg_buffer); \ + Rf_error("%s", R_errmsg_buffer); \ return R_NilValue; /* <- won't be reached */ #define CHECK_CALL(x) \ @@ -224,16 +224,19 @@ SEXP LGBM_DatasetGetSubset_R(SEXP handle, _AssertDatasetHandleNotNull(handle); SEXP ret = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue)); int32_t len = static_cast(Rf_asInteger(len_used_row_indices)); - std::vector idxvec(len); + std::unique_ptr idxvec(new int32_t[len]); // convert from one-based to zero-based index -#pragma omp parallel for schedule(static, 512) if (len >= 1024) + const int *used_row_indices_ = INTEGER(used_row_indices); +#ifndef _MSC_VER +#pragma omp simd +#endif for (int32_t i = 0; i < len; ++i) { - idxvec[i] = static_cast(INTEGER(used_row_indices)[i] - 1); + idxvec[i] = static_cast(used_row_indices_[i] - 1); } const char* parameters_ptr = CHAR(PROTECT(Rf_asChar(parameters))); DatasetHandle res = nullptr; CHECK_CALL(LGBM_DatasetGetSubset(R_ExternalPtrAddr(handle), - idxvec.data(), len, parameters_ptr, + idxvec.get(), len, parameters_ptr, &res)); R_SetExternalPtrAddr(ret, res); R_RegisterCFinalizerEx(ret, _DatasetFinalizer, TRUE); @@ -247,13 +250,13 @@ SEXP LGBM_DatasetSetFeatureNames_R(SEXP handle, R_API_BEGIN(); _AssertDatasetHandleNotNull(handle); auto vec_names = Split(CHAR(PROTECT(Rf_asChar(feature_names))), '\t'); - std::vector vec_sptr; int len = static_cast(vec_names.size()); + std::unique_ptr vec_sptr(new const char*[len]); for (int i = 0; i < len; ++i) { - vec_sptr.push_back(vec_names[i].c_str()); + vec_sptr[i] = vec_names[i].c_str(); } CHECK_CALL(LGBM_DatasetSetFeatureNames(R_ExternalPtrAddr(handle), - vec_sptr.data(), len)); + vec_sptr.get(), len)); UNPROTECT(1); return R_NilValue; R_API_END(); @@ -338,21 +341,13 @@ SEXP LGBM_DatasetSetField_R(SEXP handle, int len = Rf_asInteger(num_element); const char* name = CHAR(PROTECT(Rf_asChar(field_name))); if (!strcmp("group", name) || !strcmp("query", name)) { - std::vector vec(len); -#pragma omp parallel for schedule(static, 512) if (len >= 1024) - for (int i = 0; i < len; ++i) { - vec[i] = static_cast(INTEGER(field_data)[i]); - } - CHECK_CALL(LGBM_DatasetSetField(R_ExternalPtrAddr(handle), name, vec.data(), len, C_API_DTYPE_INT32)); + CHECK_CALL(LGBM_DatasetSetField(R_ExternalPtrAddr(handle), name, INTEGER(field_data), len, C_API_DTYPE_INT32)); } else if (!strcmp("init_score", name)) { CHECK_CALL(LGBM_DatasetSetField(R_ExternalPtrAddr(handle), name, REAL(field_data), len, C_API_DTYPE_FLOAT64)); } else { - std::vector vec(len); -#pragma omp parallel for schedule(static, 512) if (len >= 1024) - for (int i = 0; i < len; ++i) { - vec[i] = static_cast(REAL(field_data)[i]); - } - CHECK_CALL(LGBM_DatasetSetField(R_ExternalPtrAddr(handle), name, vec.data(), len, C_API_DTYPE_FLOAT32)); + std::unique_ptr vec(new float[len]); + std::copy(REAL(field_data), REAL(field_data) + len, vec.get()); + CHECK_CALL(LGBM_DatasetSetField(R_ExternalPtrAddr(handle), name, vec.get(), len, C_API_DTYPE_FLOAT32)); } UNPROTECT(1); return R_NilValue; @@ -372,22 +367,19 @@ SEXP LGBM_DatasetGetField_R(SEXP handle, if (!strcmp("group", name) || !strcmp("query", name)) { auto p_data = reinterpret_cast(res); // convert from boundaries to size -#pragma omp parallel for schedule(static, 512) if (out_len >= 1024) + int *field_data_ = INTEGER(field_data); +#ifndef _MSC_VER +#pragma omp simd +#endif for (int i = 0; i < out_len - 1; ++i) { - INTEGER(field_data)[i] = p_data[i + 1] - p_data[i]; + field_data_[i] = p_data[i + 1] - p_data[i]; } } else if 
(!strcmp("init_score", name)) { auto p_data = reinterpret_cast(res); -#pragma omp parallel for schedule(static, 512) if (out_len >= 1024) - for (int i = 0; i < out_len; ++i) { - REAL(field_data)[i] = p_data[i]; - } + std::copy(p_data, p_data + out_len, REAL(field_data)); } else { auto p_data = reinterpret_cast(res); -#pragma omp parallel for schedule(static, 512) if (out_len >= 1024) - for (int i = 0; i < out_len; ++i) { - REAL(field_data)[i] = p_data[i]; - } + std::copy(p_data, p_data + out_len, REAL(field_data)); } UNPROTECT(1); return R_NilValue; @@ -610,13 +602,10 @@ SEXP LGBM_BoosterUpdateOneIterCustom_R(SEXP handle, _AssertBoosterHandleNotNull(handle); int is_finished = 0; int int_len = Rf_asInteger(len); - std::vector tgrad(int_len), thess(int_len); -#pragma omp parallel for schedule(static, 512) if (int_len >= 1024) - for (int j = 0; j < int_len; ++j) { - tgrad[j] = static_cast(REAL(grad)[j]); - thess[j] = static_cast(REAL(hess)[j]); - } - CHECK_CALL(LGBM_BoosterUpdateOneIterCustom(R_ExternalPtrAddr(handle), tgrad.data(), thess.data(), &is_finished)); + std::unique_ptr tgrad(new float[int_len]), thess(new float[int_len]); + std::copy(REAL(grad), REAL(grad) + int_len, tgrad.get()); + std::copy(REAL(hess), REAL(hess) + int_len, thess.get()); + CHECK_CALL(LGBM_BoosterUpdateOneIterCustom(R_ExternalPtrAddr(handle), tgrad.get(), thess.get(), &is_finished)); return R_NilValue; R_API_END(); } @@ -1204,6 +1193,23 @@ SEXP LGBM_BoosterGetLoadedParam_R(SEXP handle) { R_API_END(); } +SEXP LGBM_GetMaxThreads_R(SEXP out) { + R_API_BEGIN(); + int num_threads; + CHECK_CALL(LGBM_GetMaxThreads(&num_threads)); + INTEGER(out)[0] = num_threads; + return R_NilValue; + R_API_END(); +} + +SEXP LGBM_SetMaxThreads_R(SEXP num_threads) { + R_API_BEGIN(); + int new_num_threads = Rf_asInteger(num_threads); + CHECK_CALL(LGBM_SetMaxThreads(new_num_threads)); + return R_NilValue; + R_API_END(); +} + // .Call() calls static const R_CallMethodDef CallEntries[] = { {"LGBM_HandleIsNull_R" , (DL_FUNC) &LGBM_HandleIsNull_R , 1}, @@ -1260,6 +1266,8 @@ static const R_CallMethodDef CallEntries[] = { {"LGBM_BoosterDumpModel_R" , (DL_FUNC) &LGBM_BoosterDumpModel_R , 3}, {"LGBM_NullBoosterHandleError_R" , (DL_FUNC) &LGBM_NullBoosterHandleError_R , 0}, {"LGBM_DumpParamAliases_R" , (DL_FUNC) &LGBM_DumpParamAliases_R , 0}, + {"LGBM_GetMaxThreads_R" , (DL_FUNC) &LGBM_GetMaxThreads_R , 1}, + {"LGBM_SetMaxThreads_R" , (DL_FUNC) &LGBM_SetMaxThreads_R , 1}, {NULL, NULL, 0} }; diff --git a/R-package/src/lightgbm_R.h b/R-package/src/lightgbm_R.h index 7141a06a207c..4f0407e8f2ec 100644 --- a/R-package/src/lightgbm_R.h +++ b/R-package/src/lightgbm_R.h @@ -850,4 +850,23 @@ LIGHTGBM_C_EXPORT SEXP LGBM_BoosterDumpModel_R( */ LIGHTGBM_C_EXPORT SEXP LGBM_DumpParamAliases_R(); +/*! +* \brief Get current maximum number of threads used by LightGBM routines in this process. +* \param[out] out current maximum number of threads used by LightGBM. -1 means defaulting to omp_get_num_threads(). +* \return R NULL value +*/ +LIGHTGBM_C_EXPORT SEXP LGBM_GetMaxThreads_R( + SEXP out +); + + +/*! +* \brief Set maximum number of threads used by LightGBM routines in this process. +* \param num_threads maximum number of threads used by LightGBM. -1 means defaulting to omp_get_num_threads(). 
+* \return R NULL value +*/ +LIGHTGBM_C_EXPORT SEXP LGBM_SetMaxThreads_R( + SEXP num_threads +); + #endif // LIGHTGBM_R_H_ diff --git a/R-package/tests/testthat.R b/R-package/tests/testthat.R index 4752952740b2..72a440c275b0 100644 --- a/R-package/tests/testthat.R +++ b/R-package/tests/testthat.R @@ -1,5 +1,5 @@ library(testthat) -library(lightgbm) +library(lightgbm) # nolint: [unused_import] test_check( package = "lightgbm" diff --git a/R-package/tests/testthat/helper.R b/R-package/tests/testthat/helper.R index 9c928c1f71d1..45edf40efbeb 100644 --- a/R-package/tests/testthat/helper.R +++ b/R-package/tests/testthat/helper.R @@ -11,6 +11,11 @@ # the check farm is a shared resource and will typically be running many checks simultaneously. # .LGB_MAX_THREADS <- 2L +setLGBMthreads(.LGB_MAX_THREADS) + +# control data.table parallelism +# ref: https://github.com/Rdatatable/data.table/issues/5658 +data.table::setDTthreads(1L) # by default, how much should results in tests be allowed to differ from hard-coded expected numbers? .LGB_NUMERIC_TOLERANCE <- 1e-6 diff --git a/R-package/tests/testthat/test_Predictor.R b/R-package/tests/testthat/test_Predictor.R index 90be1d08cf67..192171c915bf 100644 --- a/R-package/tests/testthat/test_Predictor.R +++ b/R-package/tests/testthat/test_Predictor.R @@ -17,16 +17,16 @@ test_that("Predictor$finalize() should not fail", { bst$save_model(filename = model_file) predictor <- Predictor$new(modelfile = model_file) - expect_true(lgb.is.Predictor(predictor)) + expect_true(.is_Predictor(predictor)) - expect_false(lgb.is.null.handle(predictor$.__enclos_env__$private$handle)) + expect_false(.is_null_handle(predictor$.__enclos_env__$private$handle)) predictor$finalize() - expect_true(lgb.is.null.handle(predictor$.__enclos_env__$private$handle)) + expect_true(.is_null_handle(predictor$.__enclos_env__$private$handle)) # calling finalize() a second time shouldn't cause any issues predictor$finalize() - expect_true(lgb.is.null.handle(predictor$.__enclos_env__$private$handle)) + expect_true(.is_null_handle(predictor$.__enclos_env__$private$handle)) }) test_that("predictions do not fail for integer input", { @@ -79,7 +79,7 @@ test_that("start_iteration works correctly", { , valids = list("test" = dtest) , early_stopping_rounds = 2L ) - expect_true(lgb.is.Booster(bst)) + expect_true(.is_Booster(bst)) pred1 <- predict(bst, newdata = test$data, type = "raw") pred_contrib1 <- predict(bst, test$data, type = "contrib") pred2 <- rep(0.0, length(pred1)) diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index 57c33c35dfee..75abd26dd152 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -1094,7 +1094,7 @@ test_that("lgb.train() works as expected with sparse features", { , nrounds = nrounds ) - expect_true(lgb.is.Booster(bst)) + expect_true(.is_Booster(bst)) expect_equal(bst$current_iter(), nrounds) parsed_model <- jsonlite::fromJSON(bst$dump_model()) expect_equal(parsed_model$objective, "binary sigmoid:1") @@ -1816,7 +1816,7 @@ test_that("lgb.train() supports non-ASCII feature names", { ) , colnames = feature_names ) - expect_true(lgb.is.Booster(bst)) + expect_true(.is_Booster(bst)) dumped_model <- jsonlite::fromJSON(bst$dump_model()) # UTF-8 strings are not well-supported on Windows @@ -2522,7 +2522,7 @@ test_that("lgb.train() fit on linearly-relatead data improves when using linear , params = params , valids = list("train" = dtrain) ) - expect_true(lgb.is.Booster(bst)) + 
expect_true(.is_Booster(bst)) dtrain <- .new_dataset() bst_linear <- lgb.train( @@ -2531,7 +2531,7 @@ test_that("lgb.train() fit on linearly-relatead data improves when using linear , params = utils::modifyList(params, list(linear_tree = TRUE)) , valids = list("train" = dtrain) ) - expect_true(lgb.is.Booster(bst_linear)) + expect_true(.is_Booster(bst_linear)) bst_last_mse <- bst$record_evals[["train"]][["l2"]][["eval"]][[10L]] bst_lin_last_mse <- bst_linear$record_evals[["train"]][["l2"]][["eval"]][[10L]] @@ -2599,7 +2599,7 @@ test_that("lgb.train() works with linear learners even if Dataset has missing va , params = params , valids = list("train" = dtrain) ) - expect_true(lgb.is.Booster(bst)) + expect_true(.is_Booster(bst)) dtrain <- .new_dataset() bst_linear <- lgb.train( @@ -2608,7 +2608,7 @@ test_that("lgb.train() works with linear learners even if Dataset has missing va , params = utils::modifyList(params, list(linear_tree = TRUE)) , valids = list("train" = dtrain) ) - expect_true(lgb.is.Booster(bst_linear)) + expect_true(.is_Booster(bst_linear)) bst_last_mse <- bst$record_evals[["train"]][["l2"]][["eval"]][[10L]] bst_lin_last_mse <- bst_linear$record_evals[["train"]][["l2"]][["eval"]][[10L]] @@ -2649,7 +2649,7 @@ test_that("lgb.train() works with linear learners, bagging, and a Dataset that h , params = params , valids = list("train" = dtrain) ) - expect_true(lgb.is.Booster(bst)) + expect_true(.is_Booster(bst)) dtrain <- .new_dataset() bst_linear <- lgb.train( @@ -2658,7 +2658,7 @@ test_that("lgb.train() works with linear learners, bagging, and a Dataset that h , params = utils::modifyList(params, list(linear_tree = TRUE)) , valids = list("train" = dtrain) ) - expect_true(lgb.is.Booster(bst_linear)) + expect_true(.is_Booster(bst_linear)) bst_last_mse <- bst$record_evals[["train"]][["l2"]][["eval"]][[10L]] bst_lin_last_mse <- bst_linear$record_evals[["train"]][["l2"]][["eval"]][[10L]] @@ -2699,7 +2699,7 @@ test_that("lgb.train() works with linear learners and data where a feature has o , nrounds = 10L , params = utils::modifyList(params, list(linear_tree = TRUE)) ) - expect_true(lgb.is.Booster(bst_linear)) + expect_true(.is_Booster(bst_linear)) }) test_that("lgb.train() works with linear learners when Dataset has categorical features", { @@ -2732,7 +2732,7 @@ test_that("lgb.train() works with linear learners when Dataset has categorical f , params = params , valids = list("train" = dtrain) ) - expect_true(lgb.is.Booster(bst)) + expect_true(.is_Booster(bst)) dtrain <- .new_dataset() bst_linear <- lgb.train( @@ -2741,7 +2741,7 @@ test_that("lgb.train() works with linear learners when Dataset has categorical f , params = utils::modifyList(params, list(linear_tree = TRUE)) , valids = list("train" = dtrain) ) - expect_true(lgb.is.Booster(bst_linear)) + expect_true(.is_Booster(bst_linear)) bst_last_mse <- bst$record_evals[["train"]][["l2"]][["eval"]][[10L]] bst_lin_last_mse <- bst_linear$record_evals[["train"]][["l2"]][["eval"]][[10L]] @@ -3805,3 +3805,26 @@ test_that("lightgbm() correctly sets objective when passing lgb.Dataset as input ) expect_equal(model$params$objective, "regression") }) + +test_that("Evaluation metrics aren't printed as a single-element vector", { + log_txt <- capture_output({ + data(mtcars) + y <- mtcars$mpg + x <- as.matrix(mtcars[, -1L]) + cv_result <- lgb.cv( + data = lgb.Dataset(x, label = y) + , params = list( + objective = "regression" + , metric = "l2" + , min_data_in_leaf = 5L + , max_depth = 3L + , num_threads = .LGB_MAX_THREADS + ) + , nrounds = 2L + , 
nfold = 3L + , verbose = 1L + , eval_train_metric = TRUE + ) + }) + expect_false(grepl("[1] \"[1]", log_txt, fixed = TRUE)) +}) diff --git a/R-package/tests/testthat/test_dataset.R b/R-package/tests/testthat/test_dataset.R index cf68ce9262a3..a8585baa2621 100644 --- a/R-package/tests/testthat/test_dataset.R +++ b/R-package/tests/testthat/test_dataset.R @@ -183,7 +183,9 @@ test_that("lgb.Dataset: colnames", { colnames(dtest) <- "asdf" }) new_names <- make.names(seq_len(ncol(test_data))) - expect_silent(colnames(dtest) <- new_names) + expect_silent({ + colnames(dtest) <- new_names + }) expect_equal(colnames(dtest), new_names) }) @@ -204,7 +206,7 @@ test_that("lgb.Dataset: Dataset should be able to construct from matrix and retu , rawData , nrow(rawData) , ncol(rawData) - , lightgbm:::lgb.params2str(params = list()) + , lightgbm:::.params2str(params = list()) , ref_handle ) expect_true(methods::is(handle, "externalptr")) @@ -320,7 +322,7 @@ test_that("Dataset$update_parameters() does nothing for empty inputs", { res <- ds$update_params( params = list() ) - expect_true(lgb.is.Dataset(res)) + expect_true(.is_Dataset(res)) new_params <- ds$get_params() expect_identical(new_params, initial_params) @@ -341,7 +343,7 @@ test_that("Dataset$update_params() works correctly for recognized Dataset parame res <- ds$update_params( params = new_params ) - expect_true(lgb.is.Dataset(res)) + expect_true(.is_Dataset(res)) updated_params <- ds$get_params() for (param_name in names(new_params)) { @@ -354,17 +356,17 @@ test_that("Dataset$finalize() should not fail on an already-finalized Dataset", data = test_data , label = test_label ) - expect_true(lgb.is.null.handle(dtest$.__enclos_env__$private$handle)) + expect_true(.is_null_handle(dtest$.__enclos_env__$private$handle)) dtest$construct() - expect_false(lgb.is.null.handle(dtest$.__enclos_env__$private$handle)) + expect_false(.is_null_handle(dtest$.__enclos_env__$private$handle)) dtest$finalize() - expect_true(lgb.is.null.handle(dtest$.__enclos_env__$private$handle)) + expect_true(.is_null_handle(dtest$.__enclos_env__$private$handle)) # calling finalize() a second time shouldn't cause any issues dtest$finalize() - expect_true(lgb.is.null.handle(dtest$.__enclos_env__$private$handle)) + expect_true(.is_null_handle(dtest$.__enclos_env__$private$handle)) }) test_that("lgb.Dataset: should be able to run lgb.train() immediately after using lgb.Dataset() on a file", { @@ -399,7 +401,7 @@ test_that("lgb.Dataset: should be able to run lgb.train() immediately after usin , data = dtest_read_in ) - expect_true(lgb.is.Booster(x = bst)) + expect_true(.is_Booster(x = bst)) }) test_that("lgb.Dataset: should be able to run lgb.cv() immediately after using lgb.Dataset() on a file", { diff --git a/R-package/tests/testthat/test_learning_to_rank.R b/R-package/tests/testthat/test_learning_to_rank.R index 6868794cf8ec..e99aff44ceb3 100644 --- a/R-package/tests/testthat/test_learning_to_rank.R +++ b/R-package/tests/testthat/test_learning_to_rank.R @@ -25,7 +25,7 @@ test_that("learning-to-rank with lgb.train() works as expected", { , data = dtrain , nrounds = 10L ) - expect_true(lgb.is.Booster(model)) + expect_true(.is_Booster(model)) dumped_model <- jsonlite::fromJSON( model$dump_model() @@ -37,7 +37,8 @@ test_that("learning-to-rank with lgb.train() works as expected", { eval_results <- model$eval_train() expect_equal(length(eval_results), length(eval_names)) for (result in eval_results) { - expect_true(result[["value"]] > 0.0 && result[["value"]] < 1.0) + 
expect_true(result[["value"]] > 0.0) + expect_true(result[["value"]] < 1.0) expect_true(result[["higher_better"]]) expect_identical(result[["data_name"]], "training") } @@ -104,8 +105,10 @@ test_that("learning-to-rank with lgb.cv() works as expected", { # check that best score and iter make sense (0.0 < nDCG < 1.0) best_iter <- cv_bst$best_iter best_score <- cv_bst$best_score - expect_true(best_iter > 0L && best_iter <= nrounds) - expect_true(best_score > 0.0 && best_score < 1.0) + expect_true(best_iter > 0L) + expect_true(best_iter <= nrounds) + expect_true(best_score > 0.0) + expect_true(best_score < 1.0) expect_true(abs(best_score - 0.75) < .LGB_NUMERIC_TOLERANCE) # best_score should be set for the first metric diff --git a/R-package/tests/testthat/test_lgb.Booster.R b/R-package/tests/testthat/test_lgb.Booster.R index 5f398f1c081d..e6b0e8abda64 100644 --- a/R-package/tests/testthat/test_lgb.Booster.R +++ b/R-package/tests/testthat/test_lgb.Booster.R @@ -11,16 +11,16 @@ test_that("Booster$finalize() should not fail", { , verbose = .LGB_VERBOSITY , nrounds = 3L ) - expect_true(lgb.is.Booster(bst)) + expect_true(.is_Booster(bst)) - expect_false(lgb.is.null.handle(bst$.__enclos_env__$private$handle)) + expect_false(.is_null_handle(bst$.__enclos_env__$private$handle)) bst$finalize() - expect_true(lgb.is.null.handle(bst$.__enclos_env__$private$handle)) + expect_true(.is_null_handle(bst$.__enclos_env__$private$handle)) # calling finalize() a second time shouldn't cause any issues bst$finalize() - expect_true(lgb.is.null.handle(bst$.__enclos_env__$private$handle)) + expect_true(.is_null_handle(bst$.__enclos_env__$private$handle)) }) test_that("lgb.get.eval.result() should throw an informative error if booster is not an lgb.Booster", { @@ -188,7 +188,7 @@ test_that("Loading a Booster from a text file works", { , params = params , nrounds = 2L ) - expect_true(lgb.is.Booster(bst)) + expect_true(.is_Booster(bst)) pred <- predict(bst, test$data) model_file <- tempfile(fileext = ".model") @@ -232,7 +232,7 @@ test_that("boosters with linear models at leaves can be written to text file and , params = params , verbose = .LGB_VERBOSITY ) - expect_true(lgb.is.Booster(bst)) + expect_true(.is_Booster(bst)) # save predictions, then write the model to a file and destroy it in R preds <- predict(bst, X) @@ -269,7 +269,7 @@ test_that("Loading a Booster from a string works", { ) , nrounds = 2L ) - expect_true(lgb.is.Booster(bst)) + expect_true(.is_Booster(bst)) pred <- predict(bst, test$data) model_string <- bst$save_model_to_string() @@ -376,7 +376,7 @@ test_that("If a string and a file are both passed to lgb.load() the file is used ) , nrounds = 2L ) - expect_true(lgb.is.Booster(bst)) + expect_true(.is_Booster(bst)) pred <- predict(bst, test$data) model_file <- tempfile(fileext = ".model") @@ -411,7 +411,7 @@ test_that("Creating a Booster from a Dataset should work", { ), train_set = dtrain ) - expect_true(lgb.is.Booster(bst)) + expect_true(.is_Booster(bst)) expect_equal(bst$current_iter(), 0L) expect_true(is.na(bst$best_score)) expect_true(all(bst$predict(agaricus.train$data) == 0.5)) @@ -446,10 +446,10 @@ test_that("Creating a Booster from a Dataset with an existing predictor should w , num_threads = .LGB_MAX_THREADS ) ) - expect_true(lgb.is.Booster(bst)) + expect_true(.is_Booster(bst)) expect_equal(bst$current_iter(), nrounds) expect_equal(bst$eval_train()[[1L]][["value"]], 0.1115352) - expect_true(lgb.is.Booster(bst_from_ds)) + expect_true(.is_Booster(bst_from_ds)) expect_equal(bst_from_ds$current_iter(), 
nrounds) expect_equal(bst_from_ds$eval_train()[[1L]][["value"]], 5.65704892) dumped_model <- jsonlite::fromJSON(bst$dump_model()) @@ -531,7 +531,7 @@ test_that("Booster$rollback_one_iter() should work as expected", { , nrounds = nrounds ) expect_equal(bst$current_iter(), nrounds) - expect_true(lgb.is.Booster(bst)) + expect_true(.is_Booster(bst)) logloss <- bst$eval_train()[[1L]][["value"]] expect_equal(logloss, 0.01904786) @@ -539,7 +539,7 @@ test_that("Booster$rollback_one_iter() should work as expected", { # rollback_one_iter() should return a booster and modify the original # booster in place - expect_true(lgb.is.Booster(x)) + expect_true(.is_Booster(x)) expect_equal(bst$current_iter(), nrounds - 1L) # score should now come from the model as of 4 iterations @@ -565,7 +565,7 @@ test_that("Booster$update() passing a train_set works as expected", { ) , nrounds = nrounds ) - expect_true(lgb.is.Booster(bst)) + expect_true(.is_Booster(bst)) expect_equal(bst$current_iter(), nrounds) bst$update( train_set = Dataset$new( @@ -574,7 +574,7 @@ test_that("Booster$update() passing a train_set works as expected", { , params = list(verbose = .LGB_VERBOSITY) ) ) - expect_true(lgb.is.Booster(bst)) + expect_true(.is_Booster(bst)) expect_equal(bst$current_iter(), nrounds + 1L) # train with 3 rounds directly @@ -590,7 +590,7 @@ test_that("Booster$update() passing a train_set works as expected", { ) , nrounds = nrounds + 1L ) - expect_true(lgb.is.Booster(bst2)) + expect_true(.is_Booster(bst2)) expect_equal(bst2$current_iter(), nrounds + 1L) # model with 2 rounds + 1 update should be identical to 3 rounds @@ -716,7 +716,7 @@ test_that("Saving a model with different feature importance types works", { ) , nrounds = 2L ) - expect_true(lgb.is.Booster(bst)) + expect_true(.is_Booster(bst)) .feat_importance_from_string <- function(model_string) { file_lines <- strsplit(model_string, "\n", fixed = TRUE)[[1L]] @@ -772,7 +772,7 @@ test_that("Saving a model with unknown importance type fails", { ) , nrounds = 2L ) - expect_true(lgb.is.Booster(bst)) + expect_true(.is_Booster(bst)) UNSUPPORTED_IMPORTANCE <- 2L expect_error({ @@ -1252,43 +1252,6 @@ test_that("lgb.cv() correctly handles passing through params to the model file", }) -test_that("params (including dataset params) should be stored in .rds file for Booster", { - data(agaricus.train, package = "lightgbm") - dtrain <- lgb.Dataset( - agaricus.train$data - , label = agaricus.train$label - , params = list( - max_bin = 17L - ) - ) - params <- list( - objective = "binary" - , max_depth = 4L - , bagging_fraction = 0.8 - , verbose = .LGB_VERBOSITY - , num_threads = .LGB_MAX_THREADS - ) - bst <- Booster$new( - params = params - , train_set = dtrain - ) - bst_file <- tempfile(fileext = ".rds") - expect_warning(saveRDS.lgb.Booster(bst, file = bst_file)) - - expect_warning(bst_from_file <- readRDS.lgb.Booster(file = bst_file)) - expect_identical( - bst_from_file$params - , list( - objective = "binary" - , max_depth = 4L - , bagging_fraction = 0.8 - , verbose = .LGB_VERBOSITY - , num_threads = .LGB_MAX_THREADS - , max_bin = 17L - ) - ) -}) - test_that("params (including dataset params) should be stored in .rds file for Booster", { data(agaricus.train, package = "lightgbm") dtrain <- lgb.Dataset( @@ -1348,46 +1311,6 @@ test_that("Handle is automatically restored when calling predict", { expect_equal(pred_before, pred_after) }) -test_that("boosters with linear models at leaves work with saveRDS.lgb.Booster and readRDS.lgb.Booster", { - X <- matrix(rnorm(100L), ncol = 1L) - 
labels <- 2L * X + runif(nrow(X), 0L, 0.1) - dtrain <- lgb.Dataset( - data = X - , label = labels - ) - - params <- list( - objective = "regression" - , verbose = .LGB_VERBOSITY - , metric = "mse" - , seed = 0L - , num_leaves = 2L - , num_threads = .LGB_MAX_THREADS - ) - - bst <- lgb.train( - data = dtrain - , nrounds = 10L - , params = params - ) - expect_true(lgb.is.Booster(bst)) - - # save predictions, then write the model to a file and destroy it in R - preds <- predict(bst, X) - model_file <- tempfile(fileext = ".rds") - expect_warning(saveRDS.lgb.Booster(bst, file = model_file)) - bst$finalize() - expect_null(bst$.__enclos_env__$private$handle) - rm(bst) - - # load the booster and make predictions...should be the same - expect_warning({ - bst2 <- readRDS.lgb.Booster(file = model_file) - }) - preds2 <- predict(bst2, X) - expect_identical(preds, preds2) -}) - test_that("boosters with linear models at leaves can be written to RDS and re-loaded successfully", { X <- matrix(rnorm(100L), ncol = 1L) labels <- 2L * X + runif(nrow(X), 0L, 0.1) @@ -1410,7 +1333,7 @@ test_that("boosters with linear models at leaves can be written to RDS and re-lo , nrounds = 10L , params = params ) - expect_true(lgb.is.Booster(bst)) + expect_true(.is_Booster(bst)) # save predictions, then write the model to a file and destroy it in R preds <- predict(bst, X) diff --git a/R-package/tests/testthat/test_multithreading.R b/R-package/tests/testthat/test_multithreading.R new file mode 100644 index 000000000000..e2f3169627a2 --- /dev/null +++ b/R-package/tests/testthat/test_multithreading.R @@ -0,0 +1,16 @@ +test_that("getLGBMthreads() and setLGBMthreads() work as expected", { + # works with integer input + ret <- setLGBMthreads(2L) + expect_null(ret) + expect_equal(getLGBMthreads(), 2L) + + # works with float input + ret <- setLGBMthreads(1.0) + expect_null(ret) + expect_equal(getLGBMthreads(), 1L) + + # setting to any negative number sets max threads to -1 + ret <- setLGBMthreads(-312L) + expect_null(ret) + expect_equal(getLGBMthreads(), -1L) +}) diff --git a/R-package/tests/testthat/test_utils.R b/R-package/tests/testthat/test_utils.R index 4ab05e075ae3..898aed9b0915 100644 --- a/R-package/tests/testthat/test_utils.R +++ b/R-package/tests/testthat/test_utils.R @@ -1,12 +1,12 @@ -test_that("lgb.params2str() works as expected for empty lists", { - out_str <- lgb.params2str( +test_that(".params2str() works as expected for empty lists", { + out_str <- .params2str( params = list() ) expect_identical(class(out_str), "character") expect_equal(out_str, "") }) -test_that("lgb.params2str() works as expected for a key in params with multiple different-length elements", { +test_that(".params2str() works as expected for a key in params with multiple different-length elements", { metrics <- c("a", "ab", "abc", "abcdefg") params <- list( objective = "magic" @@ -14,7 +14,7 @@ test_that("lgb.params2str() works as expected for a key in params with multiple , nrounds = 10L , learning_rate = 0.0000001 ) - out_str <- lgb.params2str( + out_str <- .params2str( params = params ) expect_identical(class(out_str), "character") @@ -24,8 +24,8 @@ test_that("lgb.params2str() works as expected for a key in params with multiple ) }) -test_that("lgb.params2str() passes through duplicated params", { - out_str <- lgb.params2str( +test_that(".params2str() passes through duplicated params", { + out_str <- .params2str( params = list( objective = "regression" , bagging_fraction = 0.8 @@ -35,8 +35,8 @@ test_that("lgb.params2str() passes through 
duplicated params", { expect_equal(out_str, "objective=regression bagging_fraction=0.8 bagging_fraction=0.5") }) -test_that("lgb.check.eval works as expected with no metric", { - params <- lgb.check.eval( +test_that(".check_eval works as expected with no metric", { + params <- .check_eval( params = list(device = "cpu") , eval = "binary_error" ) @@ -44,8 +44,8 @@ test_that("lgb.check.eval works as expected with no metric", { expect_identical(params[["metric"]], list("binary_error")) }) -test_that("lgb.check.eval adds eval to metric in params", { - params <- lgb.check.eval( +test_that(".check_eval adds eval to metric in params", { + params <- .check_eval( params = list(metric = "auc") , eval = "binary_error" ) @@ -53,8 +53,8 @@ test_that("lgb.check.eval adds eval to metric in params", { expect_identical(params[["metric"]], list("auc", "binary_error")) }) -test_that("lgb.check.eval adds eval to metric in params if two evaluation names are provided", { - params <- lgb.check.eval( +test_that(".check_eval adds eval to metric in params if two evaluation names are provided", { + params <- .check_eval( params = list(metric = "auc") , eval = c("binary_error", "binary_logloss") ) @@ -62,8 +62,8 @@ test_that("lgb.check.eval adds eval to metric in params if two evaluation names expect_identical(params[["metric"]], list("auc", "binary_error", "binary_logloss")) }) -test_that("lgb.check.eval adds eval to metric in params if a list is provided", { - params <- lgb.check.eval( +test_that(".check_eval adds eval to metric in params if a list is provided", { + params <- .check_eval( params = list(metric = "auc") , eval = list("binary_error", "binary_logloss") ) @@ -71,8 +71,8 @@ test_that("lgb.check.eval adds eval to metric in params if a list is provided", expect_identical(params[["metric"]], list("auc", "binary_error", "binary_logloss")) }) -test_that("lgb.check.eval drops duplicate metrics and preserves order", { - params <- lgb.check.eval( +test_that(".check_eval drops duplicate metrics and preserves order", { + params <- .check_eval( params = list(metric = "l1") , eval = list("l2", "rmse", "l1", "rmse") ) @@ -80,9 +80,9 @@ test_that("lgb.check.eval drops duplicate metrics and preserves order", { expect_identical(params[["metric"]], list("l1", "l2", "rmse")) }) -test_that("lgb.check.wrapper_param() uses passed-in keyword arg if no alias found in params", { +test_that(".check_wrapper_param() uses passed-in keyword arg if no alias found in params", { kwarg_val <- sample(seq_len(100L), size = 1L) - params <- lgb.check.wrapper_param( + params <- .check_wrapper_param( main_param_name = "num_iterations" , params = list() , alternative_kwarg_value = kwarg_val @@ -90,10 +90,10 @@ test_that("lgb.check.wrapper_param() uses passed-in keyword arg if no alias foun expect_equal(params[["num_iterations"]], kwarg_val) }) -test_that("lgb.check.wrapper_param() prefers main parameter to alias and keyword arg", { +test_that(".check_wrapper_param() prefers main parameter to alias and keyword arg", { num_iterations <- sample(seq_len(100L), size = 1L) kwarg_val <- sample(seq_len(100L), size = 1L) - params <- lgb.check.wrapper_param( + params <- .check_wrapper_param( main_param_name = "num_iterations" , params = list( num_iterations = num_iterations @@ -108,11 +108,11 @@ test_that("lgb.check.wrapper_param() prefers main parameter to alias and keyword expect_identical(params, list(num_iterations = num_iterations)) }) -test_that("lgb.check.wrapper_param() prefers alias to keyword arg", { +test_that(".check_wrapper_param() prefers 
alias to keyword arg", { n_estimators <- sample(seq_len(100L), size = 1L) num_tree <- sample(seq_len(100L), size = 1L) kwarg_val <- sample(seq_len(100L), size = 1L) - params <- lgb.check.wrapper_param( + params <- .check_wrapper_param( main_param_name = "num_iterations" , params = list( num_tree = num_tree @@ -124,7 +124,7 @@ test_that("lgb.check.wrapper_param() prefers alias to keyword arg", { expect_identical(params, list(num_iterations = num_tree)) # switching the order shouldn't switch which one is chosen - params2 <- lgb.check.wrapper_param( + params2 <- .check_wrapper_param( main_param_name = "num_iterations" , params = list( n_estimators = n_estimators @@ -136,14 +136,14 @@ test_that("lgb.check.wrapper_param() prefers alias to keyword arg", { expect_identical(params2, list(num_iterations = num_tree)) }) -test_that("lgb.equal.or.both.null produces expected results", { - expect_true(lgb.equal.or.both.null(NULL, NULL)) - expect_false(lgb.equal.or.both.null(1.0, NULL)) - expect_false(lgb.equal.or.both.null(NULL, 1.0)) - expect_true(lgb.equal.or.both.null(1.0, 1.0)) - expect_true(lgb.equal.or.both.null(1.0, 1L)) - expect_false(lgb.equal.or.both.null(NA, NULL)) - expect_false(lgb.equal.or.both.null(NULL, NA)) - expect_false(lgb.equal.or.both.null(10.0, 1L)) - expect_true(lgb.equal.or.both.null(0L, 0L)) +test_that(".equal_or_both_null produces expected results", { + expect_true(.equal_or_both_null(NULL, NULL)) + expect_false(.equal_or_both_null(1.0, NULL)) + expect_false(.equal_or_both_null(NULL, 1.0)) + expect_true(.equal_or_both_null(1.0, 1.0)) + expect_true(.equal_or_both_null(1.0, 1L)) + expect_false(.equal_or_both_null(NA, NULL)) + expect_false(.equal_or_both_null(NULL, NA)) + expect_false(.equal_or_both_null(10.0, 1L)) + expect_true(.equal_or_both_null(0L, 0L)) }) diff --git a/R-package/vignettes/basic_walkthrough.Rmd b/R-package/vignettes/basic_walkthrough.Rmd index d7aaf676f386..01e2410d501e 100644 --- a/R-package/vignettes/basic_walkthrough.Rmd +++ b/R-package/vignettes/basic_walkthrough.Rmd @@ -3,10 +3,14 @@ title: "Basic Walkthrough" description: > This vignette describes how to train a LightGBM model for binary classification. -output: rmarkdown::html_vignette +output: + markdown::html_format: + options: + toc: true + number_sections: true vignette: > %\VignetteIndexEntry{Basic Walkthrough} - %\VignetteEngine{knitr::rmarkdown} + %\VignetteEngine{knitr::knitr} %\VignetteEncoding{UTF-8} --- @@ -23,10 +27,16 @@ knitr::opts_chunk$set( Welcome to the world of [LightGBM](https://lightgbm.readthedocs.io/en/latest/), a highly efficient gradient boosting implementation (Ke et al. 2017). -```{r setup} +```{r} library(lightgbm) ``` +```{r, include=FALSE} +# limit number of threads used, to be respectful of CRAN's resources when it checks this vignette +data.table::setDTthreads(1L) +setLGBMthreads(2L) +``` + This vignette will guide you through its basic usage. It will show how to build a simple binary classification model based on a subset of the `bank` dataset (Moro, Cortez, and Rita 2014). You will use the two input features "age" and "balance" to predict whether a client has subscribed a term deposit. ## The dataset diff --git a/README.md b/README.md index f6f4e8c570e0..3b3fe40790db 100644 --- a/README.md +++ b/README.md @@ -41,7 +41,6 @@ Next you may want to read: - [**Features**](https://github.com/microsoft/LightGBM/blob/master/docs/Features.rst) and algorithms supported by LightGBM. 
- [**Parameters**](https://github.com/microsoft/LightGBM/blob/master/docs/Parameters.rst) is an exhaustive list of customization you can make. - [**Distributed Learning**](https://github.com/microsoft/LightGBM/blob/master/docs/Parallel-Learning-Guide.rst) and [**GPU Learning**](https://github.com/microsoft/LightGBM/blob/master/docs/GPU-Tutorial.rst) can speed up computation. -- [**Laurae++ interactive documentation**](https://sites.google.com/view/lauraepp/parameters) is a detailed guide for hyperparameters. - [**FLAML**](https://www.microsoft.com/en-us/research/project/fast-and-lightweight-automl-for-large-scale-data/articles/flaml-a-fast-and-lightweight-automl-library/) provides automated tuning for LightGBM ([code examples](https://microsoft.github.io/FLAML/docs/Examples/AutoML-for-LightGBM/)). - [**Optuna Hyperparameter Tuner**](https://medium.com/optuna/lightgbm-tuner-new-optuna-integration-for-hyperparameter-optimization-8b7095e99258) provides automated tuning for LightGBM hyperparameters ([code examples](https://github.com/optuna/optuna-examples/blob/main/lightgbm/lightgbm_tuner_simple.py)). - [**Understanding LightGBM Parameters (and How to Tune Them using Neptune)**](https://neptune.ai/blog/lightgbm-parameters-guide). @@ -64,6 +63,8 @@ External (Unofficial) Repositories Projects listed here offer alternative ways to use LightGBM. They are not maintained or officially endorsed by the `LightGBM` development team. +LightGBMLSS (An extension of LightGBM to probabilistic modelling from which prediction intervals and quantiles can be derived): https://github.com/StatMixedML/LightGBMLSS + FLAML (AutoML library for hyperparameter optimization): https://github.com/microsoft/FLAML Optuna (hyperparameter optimization framework): https://github.com/optuna/optuna diff --git a/VERSION.txt b/VERSION.txt index 1f06da0058c9..67dae3929837 100644 --- a/VERSION.txt +++ b/VERSION.txt @@ -1 +1 @@ -4.1.0.99 +4.2.0.99 diff --git a/build-cran-package.sh b/build-cran-package.sh index 6558afcf77ad..9fa0c5877085 100755 --- a/build-cran-package.sh +++ b/build-cran-package.sh @@ -132,7 +132,7 @@ cd "${TEMP_R_DIR}" using_windows_and_r3=$( Rscript -e 'cat(.Platform$OS.type == "windows" && R.version[["major"]] < 4)' ) - if [[ ${using_windows_and_r3} == "TRUE" ]]; then + if test "${using_windows_and_r3}" = "TRUE"; then LGB_CXX_STD="C++11" fi sed -i.bak -e "s/~~CXXSTD~~/${LGB_CXX_STD}/" DESCRIPTION @@ -227,6 +227,7 @@ if ${BUILD_VIGNETTES} ; then rm -f ./lightgbm/src/network/*.o rm -f ./lightgbm/src/objective/*.o rm -f ./lightgbm/src/treelearner/*.o + rm -f ./lightgbm/src/utils/*.o echo "re-tarring ${TARBALL_NAME}" tar \ diff --git a/docs/.linkcheckerrc b/docs/.linkcheckerrc index 96fdcbd08157..003d8699a875 100644 --- a/docs/.linkcheckerrc +++ b/docs/.linkcheckerrc @@ -11,7 +11,7 @@ ignore= http.*amd.com/.* https.*dl.acm.org/doi/.* https.*tandfonline.com/.* -ignorewarnings=http-robots-denied,https-certificate-error +ignorewarnings=http-redirected,http-robots-denied,https-certificate-error checkextern=1 [output] diff --git a/docs/Experiments.rst b/docs/Experiments.rst index ede19cf3a2eb..4440a2c0ccae 100644 --- a/docs/Experiments.rst +++ b/docs/Experiments.rst @@ -18,19 +18,19 @@ Data We used 5 datasets to conduct our comparison experiments. 
Details of data are listed in the following table: -+-----------+-----------------------+------------------------------------------------------------------------+-------------+----------+----------------------------------------------+ -| Data | Task | Link | #Train\_Set | #Feature | Comments | -+===========+=======================+========================================================================+=============+==========+==============================================+ -| Higgs | Binary classification | `link `__ | 10,500,000 | 28 | last 500,000 samples were used as test set | -+-----------+-----------------------+------------------------------------------------------------------------+-------------+----------+----------------------------------------------+ -| Yahoo LTR | Learning to rank | `link `__ | 473,134 | 700 | set1.train as train, set1.test as test | -+-----------+-----------------------+------------------------------------------------------------------------+-------------+----------+----------------------------------------------+ -| MS LTR | Learning to rank | `link `__ | 2,270,296 | 137 | {S1,S2,S3} as train set, {S5} as test set | -+-----------+-----------------------+------------------------------------------------------------------------+-------------+----------+----------------------------------------------+ -| Expo | Binary classification | `link `__ | 11,000,000 | 700 | last 1,000,000 samples were used as test set | -+-----------+-----------------------+------------------------------------------------------------------------+-------------+----------+----------------------------------------------+ -| Allstate | Binary classification | `link `__ | 13,184,290 | 4228 | last 1,000,000 samples were used as test set | -+-----------+-----------------------+------------------------------------------------------------------------+-------------+----------+----------------------------------------------+ ++-----------+-----------------------+---------------------------------------------------------------------------------+-------------+----------+----------------------------------------------+ +| Data | Task | Link | #Train\_Set | #Feature | Comments | ++===========+=======================+=================================================================================+=============+==========+==============================================+ +| Higgs | Binary classification | `link `__ | 10,500,000 | 28 | last 500,000 samples were used as test set | ++-----------+-----------------------+---------------------------------------------------------------------------------+-------------+----------+----------------------------------------------+ +| Yahoo LTR | Learning to rank | `link `__ | 473,134 | 700 | set1.train as train, set1.test as test | ++-----------+-----------------------+---------------------------------------------------------------------------------+-------------+----------+----------------------------------------------+ +| MS LTR | Learning to rank | `link `__ | 2,270,296 | 137 | {S1,S2,S3} as train set, {S5} as test set | ++-----------+-----------------------+---------------------------------------------------------------------------------+-------------+----------+----------------------------------------------+ +| Expo | Binary classification | `link `__ | 11,000,000 | 700 | last 1,000,000 samples were used as test set | 
++-----------+-----------------------+---------------------------------------------------------------------------------+-------------+----------+----------------------------------------------+ +| Allstate | Binary classification | `link `__ | 13,184,290 | 4228 | last 1,000,000 samples were used as test set | ++-----------+-----------------------+---------------------------------------------------------------------------------+-------------+----------+----------------------------------------------+ Environment ^^^^^^^^^^^ diff --git a/docs/FAQ.rst b/docs/FAQ.rst index 3b06761114d5..43999931ca07 100644 --- a/docs/FAQ.rst +++ b/docs/FAQ.rst @@ -11,44 +11,15 @@ LightGBM FAQ ------ -Critical Issues -=============== +Please post questions, feature requests, and bug reports at https://github.com/microsoft/LightGBM/issues. -A **critical issue** could be a *crash*, *prediction error*, *nonsense output*, or something else requiring immediate attention. +This project is mostly maintained by volunteers, so please be patient. +If your request is time-sensitive or more than a month goes by without a response, please tag the maintainers below for help. -Please post such an issue in the `Microsoft/LightGBM repository `__. - -You may also ping a member of the core team according to the relevant area of expertise by mentioning them with the arabase (@) symbol: - -- `@guolinke `__ **Guolin Ke** (C++ code / R-package / Python-package) -- `@chivee `__ **Qiwei Ye** (C++ code / Python-package) -- `@shiyu1994 `__ **Yu Shi** (C++ code / Python-package) -- `@tongwu-msft` **Tong Wu** (C++ code / Python-package) -- `@hzy46 `__ **Zhiyuan He** (C++ code / Python-package) -- `@btrotta `__ **Belinda Trotta** (C++ code) -- `@Laurae2 `__ **Damien Soukhavong** (R-package) -- `@jameslamb `__ **James Lamb** (R-package / Dask-package) -- `@jmoralez `__ **José Morales** (Dask-package) -- `@wxchan `__ **Wenxuan Chen** (Python-package) -- `@henry0312 `__ **Tsukasa Omoto** (Python-package) -- `@StrikerRUS `__ **Nikita Titov** (Python-package) -- `@huanzhang12 `__ **Huan Zhang** (GPU support) - -Please include as much of the following information as possible when submitting a critical issue: - -- Is it reproducible on CLI (command line interface), R, and/or Python? - -- Is it specific to a wrapper? (R or Python?) - -- Is it specific to the compiler? (gcc or Clang version? MinGW or Visual Studio version?) - -- Is it specific to your Operating System? (Windows? Linux? macOS?) - -- Are you able to reproduce this issue with a simple case? - -- Does the issue persist after removing all optimization flags and compiling LightGBM in debug mode? - -When submitting issues, please keep in mind that this is largely a volunteer effort, and we may not be available 24/7 to provide support. +- `@guolinke `__ **Guolin Ke** +- `@shiyu1994 `__ **Yu Shi** +- `@jameslamb `__ **James Lamb** +- `@jmoralez `__ **José Morales** -------------- @@ -62,7 +33,7 @@ General LightGBM Questions 1. Where do I find more details about LightGBM parameters? ---------------------------------------------------------- -Take a look at `Parameters <./Parameters.rst>`__ and the `Laurae++/Parameters `__ website. +Take a look at `Parameters <./Parameters.rst>`__. 2. On datasets with millions of features, training does not start (or starts after a very long time). ----------------------------------------------------------------------------------------------------- @@ -263,7 +234,7 @@ As of LightGBM v4.0.0, ``setinfo()`` has been replaced by a new method, ``set_fi 3. 
``error in data.table::data.table()...argument 2 is NULL`` ------------------------------------------------------------- -If you are experiencing this error when running ``lightgbm``, you may be facing the same issue reported in `#2715 `_ and later in `#2989 `_. We have seen that some in some situations, using ``data.table`` 1.11.x results in this error. To get around this, you can upgrade your version of ``data.table`` to at least version 1.12.0. +If you are experiencing this error when running ``lightgbm``, you may be facing the same issue reported in `#2715 `_ and later in `#2989 `_. We have seen that in some situations, using ``data.table`` 1.11.x results in this error. To get around this, you can upgrade your version of ``data.table`` to at least version 1.12.0. ------ @@ -289,7 +260,7 @@ Python-package This error should be solved in latest version. If you still meet this error, try to remove ``lightgbm.egg-info`` folder in your Python-package and reinstall, -or check `this thread on stackoverflow `__. +or check `this thread on stackoverflow `__. 2. Error messages: ``Cannot ... before construct dataset``. ----------------------------------------------------------- diff --git a/docs/Features.rst b/docs/Features.rst index a7db86ec2935..89b56646588f 100644 --- a/docs/Features.rst +++ b/docs/Features.rst @@ -291,7 +291,7 @@ References .. _On Grouping for Maximum Homogeneity: https://www.tandfonline.com/doi/abs/10.1080/01621459.1958.10501479 -.. _Optimization of collective communication operations in MPICH: https://www.mcs.anl.gov/~thakur/papers/ijhpca-coll.pdf +.. _Optimization of collective communication operations in MPICH: https://web.cels.anl.gov/~thakur/papers/ijhpca-coll.pdf .. _A Communication-Efficient Parallel Algorithm for Decision Tree: http://papers.nips.cc/paper/6381-a-communication-efficient-parallel-algorithm-for-decision-tree diff --git a/docs/GPU-Performance.rst b/docs/GPU-Performance.rst index ab7ff4137cf8..64cd78eb4202 100644 --- a/docs/GPU-Performance.rst +++ b/docs/GPU-Performance.rst @@ -194,17 +194,17 @@ following article: Huan Zhang, Si Si and Cho-Jui Hsieh. `GPU Acceleration for Large-scale Tree Boosting`_. SysML Conference, 2018. -.. _link1: https://archive.ics.uci.edu/ml/datasets/HIGGS +.. _link1: https://archive.ics.uci.edu/dataset/280/higgs -.. _link2: http://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html +.. _link2: https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/binary.html .. _link3: https://www.kaggle.com/c/bosch-production-line-performance/data .. _link4: https://webscope.sandbox.yahoo.com/catalog.php?datatype=c -.. _link5: http://research.microsoft.com/en-us/projects/mslr/ +.. _link5: https://www.microsoft.com/en-us/research/project/mslr/ -.. _link6: http://stat-computing.org/dataexpo/2009/ +.. _link6: https://community.amstat.org/jointscsg-section/dataexpo/dataexpo2009 .. _0bb4a82: https://github.com/microsoft/LightGBM/commit/0bb4a82 diff --git a/docs/GPU-Targets.rst b/docs/GPU-Targets.rst index 9c3cac7c814a..ab024847d82d 100644 --- a/docs/GPU-Targets.rst +++ b/docs/GPU-Targets.rst @@ -4,7 +4,7 @@ GPU SDK Correspondence and Device Targeting Table GPU Targets Table ================= -OpenCL is a universal massively parallel programming framework that targets to multiple backends (GPU, CPU, FPGA, etc). +OpenCL is a universal massively parallel programming framework that targets multiple backends (GPU, CPU, FPGA, etc). Basically, to use a device from a vendor, you have to install drivers from that specific vendor. 
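As an optional sanity check (not part of this diff), the third-party `clinfo` utility can confirm which vendor OpenCL runtimes and devices are actually visible once drivers are installed:

```sh
# assumes a Debian/Ubuntu system where clinfo is packaged; adjust for your distro
sudo apt-get install -y clinfo
# prints one line per OpenCL platform/device the installed runtimes expose
clinfo --list
```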
Intel's and AMD's OpenCL runtime also include x86 CPU target support. NVIDIA's OpenCL runtime only supports NVIDIA GPU (no CPU support). diff --git a/docs/GPU-Tutorial.rst b/docs/GPU-Tutorial.rst index 836ab1add378..ee1d3173e556 100644 --- a/docs/GPU-Tutorial.rst +++ b/docs/GPU-Tutorial.rst @@ -33,7 +33,7 @@ After installing the drivers you need to restart the server. After about 30 seconds, the server should be up again. -If you are using an AMD GPU, you should download and install the `AMDGPU-Pro`_ driver and also install package ``ocl-icd-libopencl1`` and ``ocl-icd-opencl-dev``. +If you are using an AMD GPU, you should download and install the `AMDGPU-Pro`_ driver and also install packages ``ocl-icd-libopencl1`` and ``ocl-icd-opencl-dev``. Build LightGBM -------------- diff --git a/docs/GPU-Windows.rst b/docs/GPU-Windows.rst index 36e657e5801b..c4c2ca818433 100644 --- a/docs/GPU-Windows.rst +++ b/docs/GPU-Windows.rst @@ -152,7 +152,7 @@ Download `Prebuilt Boost x86_64`_ or `Prebuilt Boost i686`_ and unpack them wit Boost Compilation ----------------- -Installing Boost requires to download Boost and to install it. +Installing Boost requires downloading Boost and installing it. It takes about 10 minutes to several hours depending on your CPU speed and network speed. We will assume an installation in ``C:\boost`` and a general installation (like in Unix variants: without versioning and without type tags). @@ -279,7 +279,7 @@ Installing CMake requires one download first and then a lot of configuration for :target: ./_static/images/screenshot-downloading-cmake.png :alt: A screenshot of the binary distributions of C Make for downloading on 64 bit Windows. -- Download `CMake`_ (3.8 or higher) +- Download `CMake`_ - Install CMake diff --git a/docs/Installation-Guide.rst b/docs/Installation-Guide.rst index 412f0ec5993a..62383d9f924a 100644 --- a/docs/Installation-Guide.rst +++ b/docs/Installation-Guide.rst @@ -69,7 +69,7 @@ The ``.exe`` file will be in ``LightGBM-master/windows/x64/Release`` folder. From Command Line ***************** -1. Install `Git for Windows`_, `CMake`_ (3.8 or higher) and `VS Build Tools`_ (**VS Build Tools** is not needed if **Visual Studio** (2015 or newer) is already installed). +1. Install `Git for Windows`_, `CMake`_ and `VS Build Tools`_ (**VS Build Tools** is not needed if **Visual Studio** (2015 or newer) is already installed). 2. Run the following commands: @@ -167,7 +167,7 @@ Install Using ``Homebrew`` Build from GitHub ***************** -1. Install `CMake`_ (3.16 or higher): +1. Install `CMake`_ : .. code:: sh @@ -193,7 +193,7 @@ Build from GitHub gcc ^^^ -1. Install `CMake`_ (3.2 or higher): +1. Install `CMake`_ : .. code:: sh @@ -266,7 +266,7 @@ The ``.exe`` file will be in ``LightGBM-master/windows/x64/Release`` folder. From Command Line ----------------- -1. Install `Git for Windows`_, `CMake`_ (3.8 or higher) and `VS Build Tools`_ (**VS Build Tools** is not needed if **Visual Studio** (2015 or newer) is already installed). +1. Install `Git for Windows`_, `CMake`_ and `VS Build Tools`_ (**VS Build Tools** is not needed if **Visual Studio** (2015 or newer) is already installed). 2. Run the following commands: @@ -331,7 +331,7 @@ Apple Clang Only **Apple Clang** version 8.1 or higher is supported. -1. Install `CMake`_ (3.16 or higher): +1. Install `CMake`_ : .. code:: sh @@ -351,7 +351,7 @@ Only **Apple Clang** version 8.1 or higher is supported. gcc *** -1. Install `CMake`_ (3.2 or higher): +1. Install `CMake`_ : .. 
code:: sh @@ -414,7 +414,7 @@ From Command Line 1. You need to install `MS MPI`_ first. Both ``msmpisdk.msi`` and ``msmpisetup.exe`` are needed. -2. Install `Git for Windows`_, `CMake`_ (3.8 or higher) and `VS Build Tools`_ (**VS Build Tools** is not needed if **Visual Studio** (2015 or newer) is already installed). +2. Install `Git for Windows`_, `CMake`_ and `VS Build Tools`_ (**VS Build Tools** is not needed if **Visual Studio** (2015 or newer) is already installed). 3. Run the following commands: @@ -465,7 +465,7 @@ Apple Clang Only **Apple Clang** version 8.1 or higher is supported. -1. Install `CMake`_ (3.16 or higher): +1. Install `CMake`_ : .. code:: sh @@ -497,7 +497,7 @@ Only **Apple Clang** version 8.1 or higher is supported. gcc *** -1. Install `CMake`_ (3.2 or higher): +1. Install `CMake`_ : .. code:: sh @@ -547,7 +547,7 @@ The following dependencies should be installed before compilation: The following Debian packages should provide necessary Boost libraries: ``libboost-dev``, ``libboost-system-dev``, ``libboost-filesystem-dev``. -- **CMake** 3.2 or later. +- **CMake** To build LightGBM GPU version, run the following commands: @@ -575,7 +575,7 @@ If you use **MinGW**, the build procedure is similar to the build on Linux. Refe Following procedure is for the **MSVC** (Microsoft Visual C++) build. -1. Install `Git for Windows`_, `CMake`_ (3.8 or higher) and `VS Build Tools`_ (**VS Build Tools** is not needed if **Visual Studio** (2015 or newer) is installed). +1. Install `Git for Windows`_, `CMake`_ and `VS Build Tools`_ (**VS Build Tools** is not needed if **Visual Studio** (2015 or newer) is installed). 2. Install **OpenCL** for Windows. The installation depends on the brand (NVIDIA, AMD, Intel) of your GPU card. @@ -635,9 +635,9 @@ On Linux a CUDA version of LightGBM can be built using **CUDA**, **CMake** and * The following dependencies should be installed before compilation: -- **CUDA** 10.0 or later libraries. Please refer to `this detailed guide`_. Pay great attention to the minimum required versions of host compilers listed in the table from that guide and use only recommended versions of compilers. +- **CUDA** 11.0 or later libraries. Please refer to `this detailed guide`_. Pay great attention to the minimum required versions of host compilers listed in the table from that guide and use only recommended versions of compilers. -- **CMake** 3.16 or later. +- **CMake** To build LightGBM CUDA version, run the following commands: @@ -700,7 +700,7 @@ On Windows a Java wrapper of LightGBM can be built using **Java**, **SWIG**, **C VS Build Tools ************** -1. Install `Git for Windows`_, `CMake`_ (3.8 or higher) and `VS Build Tools`_ (**VS Build Tools** is not needed if **Visual Studio** (2015 or newer) is already installed). +1. Install `Git for Windows`_, `CMake`_ and `VS Build Tools`_ (**VS Build Tools** is not needed if **Visual Studio** (2015 or newer) is already installed). 2. Install `SWIG`_ and **Java** (also make sure that ``JAVA_HOME`` is set properly). @@ -779,7 +779,7 @@ Apple Clang Only **Apple Clang** version 8.1 or higher is supported. -1. Install `CMake`_ (3.16 or higher): +1. Install `CMake`_ : .. code:: sh @@ -805,7 +805,7 @@ Only **Apple Clang** version 8.1 or higher is supported. gcc *** -1. Install `CMake`_ (3.2 or higher): +1. Install `CMake`_ : .. code:: sh @@ -839,7 +839,7 @@ Windows On Windows, C++ unit tests of LightGBM can be built using **CMake** and **VS Build Tools**. -1. 
Install `Git for Windows`_, `CMake`_ (3.8 or higher) and `VS Build Tools`_ (**VS Build Tools** is not needed if **Visual Studio** (2015 or newer) is already installed). +1. Install `Git for Windows`_, `CMake`_ and `VS Build Tools`_ (**VS Build Tools** is not needed if **Visual Studio** (2015 or newer) is already installed). 2. Run the following commands: @@ -884,7 +884,7 @@ Apple Clang Only **Apple Clang** version 8.1 or higher is supported. -1. Install `CMake`_ (3.16 or higher): +1. Install `CMake`_ : .. code:: sh @@ -904,7 +904,7 @@ Only **Apple Clang** version 8.1 or higher is supported. gcc *** -1. Install `CMake`_ (3.2 or higher): +1. Install `CMake`_ : .. code:: sh @@ -950,7 +950,7 @@ gcc .. _RDMA: https://en.wikipedia.org/wiki/Remote_direct_memory_access -.. _MS MPI: https://docs.microsoft.com/en-us/message-passing-interface/microsoft-mpi-release-notes +.. _MS MPI: https://learn.microsoft.com/en-us/message-passing-interface/microsoft-mpi-release-notes .. _Open MPI: https://www.open-mpi.org/ @@ -960,7 +960,7 @@ gcc .. _Boost Binaries: https://sourceforge.net/projects/boost/files/boost-binaries/ -.. _SWIG: http://www.swig.org/download.html +.. _SWIG: https://www.swig.org/download.html .. _this detailed guide: https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html diff --git a/docs/Parallel-Learning-Guide.rst b/docs/Parallel-Learning-Guide.rst index e1857034e499..cbc7b1012b98 100644 --- a/docs/Parallel-Learning-Guide.rst +++ b/docs/Parallel-Learning-Guide.rst @@ -384,8 +384,6 @@ From the point forward, you can use any of the following methods to save the Boo Kubeflow ^^^^^^^^ -`Kubeflow Fairing`_ supports LightGBM distributed training. `These examples`_ show how to get started with LightGBM and Kubeflow Fairing in a hybrid cloud environment. - Kubeflow users can also use the `Kubeflow XGBoost Operator`_ for machine learning workflows with LightGBM. You can see `this example`_ for more details. Kubeflow integrations for LightGBM are not maintained by LightGBM's maintainers. @@ -520,7 +518,7 @@ See `the mars documentation`_ for usage examples. .. _the Dask DataFrame documentation: https://docs.dask.org/en/latest/dataframe.html -.. _the Dask prediction example: https://github.com/microsoft/lightgbm/tree/master/examples/python-guide/dask/prediction.py +.. _the Dask prediction example: https://github.com/microsoft/LightGBM/blob/master/examples/python-guide/dask/prediction.py .. _the Dask worker documentation: https://distributed.dask.org/en/stable/worker-memory.html @@ -528,10 +526,6 @@ See `the mars documentation`_ for usage examples. .. _these Dask examples: https://github.com/microsoft/lightgbm/tree/master/examples/python-guide/dask -.. _Kubeflow Fairing: https://www.kubeflow.org/docs/components/fairing/fairing-overview - -.. _These examples: https://github.com/kubeflow/fairing/tree/master/examples/lightgbm - .. _Kubeflow XGBoost Operator: https://github.com/kubeflow/xgboost-operator .. _this example: https://github.com/kubeflow/xgboost-operator/tree/master/config/samples/lightgbm-dist @@ -542,7 +536,7 @@ See `the mars documentation`_ for usage examples. .. _lightgbm_ray: https://github.com/ray-project/lightgbm_ray -.. _Ray: https://ray.io/ +.. _Ray: https://www.ray.io/ .. 
_the lightgbm_ray documentation: https://docs.ray.io/en/latest/tune/api_docs/integration.html#lightgbm-tune-integration-lightgbm diff --git a/docs/Parameters.rst b/docs/Parameters.rst index 86104ba5be55..341cdd487c71 100644 --- a/docs/Parameters.rst +++ b/docs/Parameters.rst @@ -14,10 +14,6 @@ This page contains descriptions of all parameters in LightGBM. - `Parameters Tuning <./Parameters-Tuning.rst>`__ -**External Links** - -- `Laurae++ Interactive Documentation`_ - Parameters Format ----------------- @@ -119,7 +115,7 @@ Core Parameters - ranking application - - ``lambdarank``, `lambdarank `__ objective. `label_gain <#label_gain>`__ can be used to set the gain (weight) of ``int`` label and all values in ``label`` must be smaller than number of elements in ``label_gain`` + - ``lambdarank``, `lambdarank `__ objective. `label_gain <#label_gain>`__ can be used to set the gain (weight) of ``int`` label and all values in ``label`` must be smaller than number of elements in ``label_gain`` - ``rank_xendcg``, `XE_NDCG_MART `__ ranking objective function, aliases: ``xendcg``, ``xe_ndcg``, ``xe_ndcg_mart``, ``xendcg_mart`` @@ -536,15 +532,15 @@ Learning Control Parameters - ``basic``, the most basic monotone constraints method. It does not slow the library at all, but over-constrains the predictions - - ``intermediate``, a `more advanced method `__, which may slow the library very slightly. However, this method is much less constraining than the basic method and should significantly improve the results + - ``intermediate``, a `more advanced method `__, which may slow the library very slightly. However, this method is much less constraining than the basic method and should significantly improve the results - - ``advanced``, an `even more advanced method `__, which may slow the library. However, this method is even less constraining than the intermediate method and should again significantly improve the results + - ``advanced``, an `even more advanced method `__, which may slow the library. However, this method is even less constraining than the intermediate method and should again significantly improve the results - ``monotone_penalty`` :raw-html:`🔗︎`, default = ``0.0``, type = double, aliases: ``monotone_splits_penalty``, ``ms_penalty``, ``mc_penalty``, constraints: ``monotone_penalty >= 0.0`` - used only if ``monotone_constraints`` is set - - `monotone penalty `__: a penalization parameter X forbids any monotone splits on the first X (rounded down) level(s) of the tree. The penalty applied to monotone splits on a given depth is a continuous, increasing function the penalization parameter + - `monotone penalty `__: a penalization parameter X forbids any monotone splits on the first X (rounded down) level(s) of the tree. 
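To make the ``monotone_penalty`` mechanics described here concrete, an illustrative R parameter list (made-up values, not part of this diff):

```r
params <- list(
    objective = "regression"
    # one entry per feature: 1 = increasing, -1 = decreasing, 0 = unconstrained
    , monotone_constraints = c(1L, -1L, 0L)
    # a penalty of 2.0 forbids monotone splits on the first 2 levels of each tree
    , monotone_penalty = 2.0
)
```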
The penalty applied to monotone splits on a given depth is a continuous, increasing function the penalization parameter - if ``0.0`` (the default), no penalization is applied @@ -564,7 +560,7 @@ Learning Control Parameters - **Note**: the forced split logic will be ignored, if the split makes gain worse - - see `this file `__ as an example + - see `this file `__ as an example - ``refit_decay_rate`` :raw-html:`🔗︎`, default = ``0.9``, type = double, constraints: ``0.0 <= refit_decay_rate <= 1.0`` @@ -770,7 +766,7 @@ Dataset Parameters - ``enable_bundle`` :raw-html:`🔗︎`, default = ``true``, type = bool, aliases: ``is_enable_bundle``, ``bundle`` - - set this to ``false`` to disable Exclusive Feature Bundling (EFB), which is described in `LightGBM: A Highly Efficient Gradient Boosting Decision Tree `__ + - set this to ``false`` to disable Exclusive Feature Bundling (EFB), which is described in `LightGBM: A Highly Efficient Gradient Boosting Decision Tree `__ - **Note**: disabling this may cause the slow training speed for sparse datasets @@ -894,7 +890,7 @@ Dataset Parameters - ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bound`` (array of thresholds for binning) - - see `this file `__ as an example + - see `this file `__ as an example - ``save_binary`` :raw-html:`🔗︎`, default = ``false``, type = bool, aliases: ``is_save_binary``, ``is_save_binary_file`` @@ -961,7 +957,7 @@ Predict Parameters - produces ``#features + 1`` values where the last value is the expected value of the model output over the training data - - **Note**: if you want to get more explanation for your model's predictions using SHAP values like SHAP interaction values, you can install `shap package `__ + - **Note**: if you want to get more explanation for your model's predictions using SHAP values like SHAP interaction values, you can install `shap package `__ - **Note**: unlike the shap package, with ``predict_contrib`` we return a matrix with an extra column, where the last column is the expected value @@ -1380,5 +1376,3 @@ If the name of data file is ``train.txt``, the query file should be named as ``t In this case, LightGBM will load the query file automatically if it exists. Also, you can include query/group id column in your data file. Please refer to the ``group_column`` `parameter <#group_column>`__ in above. - -.. _Laurae++ Interactive Documentation: https://sites.google.com/view/lauraepp/parameters diff --git a/docs/Quick-Start.rst b/docs/Quick-Start.rst index 04e64beb1281..4a372db6736e 100644 --- a/docs/Quick-Start.rst +++ b/docs/Quick-Start.rst @@ -50,7 +50,7 @@ The parameters format is ``key1=value1 key2=value2 ...``. Parameters can be set both in config file and command line. If one parameter appears in both command line and config file, LightGBM will use the parameter from the command line. -The most important parameters which new users should take a look to are located into `Core Parameters <./Parameters.rst#core-parameters>`__ +The most important parameters which new users should take a look at are located into `Core Parameters <./Parameters.rst#core-parameters>`__ and the top of `Learning Control Parameters <./Parameters.rst#learning-control-parameters>`__ sections of the full detailed list of `LightGBM's parameters <./Parameters.rst>`__. @@ -85,4 +85,4 @@ Examples .. _LibSVM: https://www.csie.ntu.edu.tw/~cjlin/libsvm/ -.. _Expo data: http://stat-computing.org/dataexpo/2009/ +.. 
_Expo data: https://community.amstat.org/jointscsg-section/dataexpo/dataexpo2009 diff --git a/docs/env.yml b/docs/env.yml index 69bcd92fbdc3..351ce1f0fae0 100644 --- a/docs/env.yml +++ b/docs/env.yml @@ -9,9 +9,9 @@ dependencies: - r-data.table=1.14.2 - r-jsonlite=1.7.2 - r-knitr=1.37 + - r-markdown - r-matrix=1.4_0 - r-pkgdown=1.6.1 - - r-rmarkdown=2.11 - r-roxygen2=7.2.1 - scikit-learn - sphinx diff --git a/docs/gcc-Tips.rst b/docs/gcc-Tips.rst index ad5981855d20..938aee407f7d 100644 --- a/docs/gcc-Tips.rst +++ b/docs/gcc-Tips.rst @@ -25,8 +25,6 @@ You can find more details on the experimentation below: - `Laurae's Benchmark Master Data (Interactive) `__ -- `Kaggle Paris Meetup #12 Slides `__ - The image below compares the runtime for training with different compiler options to a baseline using LightGBM compiled with ``-O2 --mtune=core2``. All three options are faster than that baseline. The best performance was achieved with ``-O3 --mtune=native``. .. image:: ./_static/images/gcc-comparison-2.png diff --git a/include/LightGBM/arrow.h b/include/LightGBM/arrow.h new file mode 100644 index 000000000000..767da12a9809 --- /dev/null +++ b/include/LightGBM/arrow.h @@ -0,0 +1,259 @@ +/*! + * Copyright (c) 2023 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + * + * Author: Oliver Borchert + */ + +#ifndef LIGHTGBM_ARROW_H_ +#define LIGHTGBM_ARROW_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* -------------------------------------- C DATA INTERFACE ------------------------------------- */ +// The C data interface is taken from +// https://arrow.apache.org/docs/format/CDataInterface.html#structure-definitions +// and is available under Apache License 2.0 (https://www.apache.org/licenses/LICENSE-2.0). + +#ifdef __cplusplus +extern "C" { +#endif + +#define ARROW_FLAG_DICTIONARY_ORDERED 1 +#define ARROW_FLAG_NULLABLE 2 +#define ARROW_FLAG_MAP_KEYS_SORTED 4 + +struct ArrowSchema { + // Array type description + const char* format; + const char* name; + const char* metadata; + int64_t flags; + int64_t n_children; + struct ArrowSchema** children; + struct ArrowSchema* dictionary; + + // Release callback + void (*release)(struct ArrowSchema*); + // Opaque producer-specific data + void* private_data; +}; + +struct ArrowArray { + // Array data description + int64_t length; + int64_t null_count; + int64_t offset; + int64_t n_buffers; + int64_t n_children; + const void** buffers; + struct ArrowArray** children; + struct ArrowArray* dictionary; + + // Release callback + void (*release)(struct ArrowArray*); + // Opaque producer-specific data + void* private_data; +}; + +#ifdef __cplusplus +} +#endif + +/* --------------------------------------------------------------------------------------------- */ +/* CHUNKED ARRAY */ +/* --------------------------------------------------------------------------------------------- */ + +namespace LightGBM { + +/** + * @brief Arrow array-like container for a list of Arrow arrays. + */ +class ArrowChunkedArray { + /* List of length `n` for `n` chunks containing the individual Arrow arrays. */ + std::vector<const ArrowArray*> chunks_; + /* Schema for all chunks. */ + const ArrowSchema* schema_; + /* List of length `n + 1` for `n` chunks containing the offsets for each chunk. 
*/ + std::vector chunk_offsets_; + + inline void construct_chunk_offsets() { + chunk_offsets_.reserve(chunks_.size() + 1); + chunk_offsets_.emplace_back(0); + for (size_t k = 0; k < chunks_.size(); ++k) { + chunk_offsets_.emplace_back(chunks_[k]->length + chunk_offsets_.back()); + } + } + + public: + /** + * @brief Construct a new Arrow Chunked Array object. + * + * @param chunks A list with the chunks. + * @param schema The schema for all chunks. + */ + inline ArrowChunkedArray(std::vector chunks, const ArrowSchema* schema) { + chunks_ = chunks; + schema_ = schema; + construct_chunk_offsets(); + } + + /** + * @brief Construct a new Arrow Chunked Array object. + * + * @param n_chunks The number of chunks. + * @param chunks A C-style array containing the chunks. + * @param schema The schema for all chunks. + */ + inline ArrowChunkedArray(int64_t n_chunks, + const struct ArrowArray* chunks, + const struct ArrowSchema* schema) { + chunks_.reserve(n_chunks); + for (auto k = 0; k < n_chunks; ++k) { + if (chunks[k].length == 0) continue; + chunks_.push_back(&chunks[k]); + } + schema_ = schema; + construct_chunk_offsets(); + } + + /** + * @brief Get the length of the chunked array. + * This method returns the cumulative length of all chunks. + * Complexity: O(1) + * + * @return int64_t The number of elements in the chunked array. + */ + inline int64_t get_length() const { return chunk_offsets_.back(); } + + /* ----------------------------------------- ITERATOR ---------------------------------------- */ + template + class Iterator { + using getter_fn = std::function; + + /* Reference to the chunked array that this iterator iterates over. */ + const ArrowChunkedArray& array_; + /* Function to fetch the value at a certain index from a single chunk. */ + getter_fn get_; + /* The chunk the iterator currently points to. */ + int64_t ptr_chunk_; + /* The index inside the current chunk that the iterator points to. */ + int64_t ptr_offset_; + + public: + using iterator_category = std::random_access_iterator_tag; + using difference_type = int64_t; + using value_type = T; + using pointer = value_type*; + using reference = value_type&; + + /** + * @brief Construct a new Iterator object. + * + * @param array Reference to the chunked array to iterate over. + * @param get Function to fetch the value at a certain index from a single chunk. + * @param ptr_chunk The index of the chunk whose first element the iterator initially points to. + */ + Iterator(const ArrowChunkedArray& array, getter_fn get, int64_t ptr_chunk); + + T operator*() const; + template + T operator[](I idx) const; + + Iterator& operator++(); + Iterator& operator--(); + Iterator& operator+=(int64_t c); + + template + friend bool operator==(const Iterator& a, const Iterator& b); + template + friend bool operator!=(const Iterator& a, const Iterator& b); + template + friend int64_t operator-(const Iterator& a, const Iterator& b); + }; + + /** + * @brief Obtain an iterator to the beginning of the chunked array. + * + * @tparam T The value type of the iterator. May be any primitive type. + * @return Iterator The iterator. + */ + template + inline Iterator begin() const; + + /** + * @brief Obtain an iterator to the end of the chunked array. + * + * @tparam T The value type of the iterator. May be any primitive type. + * @return Iterator The iterator. + */ + template + inline Iterator end() const; + + template + friend int64_t operator-(const Iterator& a, const Iterator& b); +}; + +/** + * @brief Arrow container for a list of chunked arrays. 
+ */ +class ArrowTable { + std::vector columns_; + + public: + /** + * @brief Construct a new Arrow Table object. + * + * @param n_chunks The number of chunks. + * @param chunks A C-style array containing the chunks. + * @param schema The schema for all chunks. + */ + inline ArrowTable(int64_t n_chunks, const ArrowArray* chunks, const ArrowSchema* schema) { + columns_.reserve(schema->n_children); + for (int64_t j = 0; j < schema->n_children; ++j) { + std::vector children_chunks; + children_chunks.reserve(n_chunks); + for (int64_t k = 0; k < n_chunks; ++k) { + if (chunks[k].length == 0) continue; + children_chunks.push_back(chunks[k].children[j]); + } + columns_.emplace_back(children_chunks, schema->children[j]); + } + } + + /** + * @brief Get the number of rows in the table. + * + * @return int64_t The number of rows. + */ + inline int64_t get_num_rows() const { return columns_.front().get_length(); } + + /** + * @brief Get the number of columns of this table. + * + * @return int64_t The column count. + */ + inline int64_t get_num_columns() const { return columns_.size(); } + + /** + * @brief Get the column at a particular index. + * + * @param idx The index of the column, must be in the range `[0, num_columns)`. + * @return const ArrowChunkedArray& The chunked array for the child at the provided index. + */ + inline const ArrowChunkedArray& get_column(size_t idx) const { return this->columns_[idx]; } +}; + +} // namespace LightGBM + +#include "arrow.tpp" + +#endif /* LIGHTGBM_ARROW_H_ */ diff --git a/include/LightGBM/arrow.tpp b/include/LightGBM/arrow.tpp new file mode 100644 index 000000000000..8d1ce4f4c0c1 --- /dev/null +++ b/include/LightGBM/arrow.tpp @@ -0,0 +1,190 @@ +#include + +#ifndef ARROW_TPP_ +#define ARROW_TPP_ + +namespace LightGBM { + +/** + * @brief Obtain a function to access an index from an Arrow array. + * + * @tparam T The return type of the function, must be a primitive type. + * @param dtype The Arrow format string describing the datatype of the Arrow array. + * @return std::function The index accessor function. 
+ */ +template +std::function get_index_accessor(const char* dtype); + +/* ---------------------------------- ITERATOR INITIALIZATION ---------------------------------- */ + +template +inline ArrowChunkedArray::Iterator ArrowChunkedArray::begin() const { + return ArrowChunkedArray::Iterator(*this, get_index_accessor(schema_->format), 0); +} + +template +inline ArrowChunkedArray::Iterator ArrowChunkedArray::end() const { + return ArrowChunkedArray::Iterator(*this, get_index_accessor(schema_->format), + chunk_offsets_.size() - 1); +} + +/* ---------------------------------- ITERATOR IMPLEMENTATION ---------------------------------- */ + +template +ArrowChunkedArray::Iterator::Iterator(const ArrowChunkedArray& array, + getter_fn get, + int64_t ptr_chunk) + : array_(array), get_(get), ptr_chunk_(ptr_chunk) { + this->ptr_offset_ = 0; +} + +template +T ArrowChunkedArray::Iterator::operator*() const { + auto chunk = array_.chunks_[ptr_chunk_]; + return static_cast(get_(chunk, ptr_offset_)); +} + +template +template +T ArrowChunkedArray::Iterator::operator[](I idx) const { + auto it = std::lower_bound(array_.chunk_offsets_.begin(), array_.chunk_offsets_.end(), idx, + [](int64_t a, int64_t b) { return a <= b; }); + + auto chunk_idx = std::distance(array_.chunk_offsets_.begin() + 1, it); + auto chunk = array_.chunks_[chunk_idx]; + + auto ptr_offset = static_cast(idx) - array_.chunk_offsets_[chunk_idx]; + return static_cast(get_(chunk, ptr_offset)); +} + +template +ArrowChunkedArray::Iterator& ArrowChunkedArray::Iterator::operator++() { + if (ptr_offset_ + 1 >= array_.chunks_[ptr_chunk_]->length) { + ptr_offset_ = 0; + ptr_chunk_++; + } else { + ptr_offset_++; + } + return *this; +} + +template +ArrowChunkedArray::Iterator& ArrowChunkedArray::Iterator::operator--() { + if (ptr_offset_ == 0) { + ptr_chunk_--; + ptr_offset_ = array_.chunks_[ptr_chunk_]->length - 1; + } else { + ptr_chunk_--; + } + return *this; +} + +template +ArrowChunkedArray::Iterator& ArrowChunkedArray::Iterator::operator+=(int64_t c) { + while (ptr_offset_ + c >= array_.chunks_[ptr_chunk_]->length) { + c -= array_.chunks_[ptr_chunk_]->length - ptr_offset_; + ptr_offset_ = 0; + ptr_chunk_++; + } + ptr_offset_ += c; + return *this; +} + +template +bool operator==(const ArrowChunkedArray::Iterator& a, const ArrowChunkedArray::Iterator& b) { + return a.ptr_chunk_ == b.ptr_chunk_ && a.ptr_offset_ == b.ptr_offset_; +} + +template +bool operator!=(const ArrowChunkedArray::Iterator& a, const ArrowChunkedArray::Iterator& b) { + return a.ptr_chunk_ != b.ptr_chunk_ || a.ptr_offset_ != b.ptr_offset_; +} + +template +int64_t operator-(const ArrowChunkedArray::Iterator& a, + const ArrowChunkedArray::Iterator& b) { + auto full_offset_a = a.array_.chunk_offsets_[a.ptr_chunk_] + a.ptr_offset_; + auto full_offset_b = b.array_.chunk_offsets_[b.ptr_chunk_] + b.ptr_offset_; + return full_offset_a - full_offset_b; +} + +/* --------------------------------------- INDEX ACCESSOR -------------------------------------- */ + +/** + * @brief The value of "no value" for a primitive type. + * + * @tparam T The type for which the missing value is defined. + * @return T The missing value. 
+ */ +template +inline T arrow_primitive_missing_value() { + return 0; +} + +template <> +inline double arrow_primitive_missing_value() { + return std::numeric_limits::quiet_NaN(); +} + +template <> +inline float arrow_primitive_missing_value() { + return std::numeric_limits::quiet_NaN(); +} + +template +struct ArrayIndexAccessor { + V operator()(const ArrowArray* array, size_t idx) { + auto buffer_idx = idx + array->offset; + + // For primitive types, buffer at idx 0 provides validity, buffer at idx 1 data, see: + // https://arrow.apache.org/docs/format/Columnar.html#buffer-listing-for-each-layout + auto validity = static_cast(array->buffers[0]); + + // Take return value from data buffer conditional on the validity of the index: + // - The structure of validity bitmasks is taken from here: + // https://arrow.apache.org/docs/format/Columnar.html#validity-bitmaps + // - If the bitmask is NULL, all indices are valid + if (validity == nullptr || (validity[buffer_idx / 8] & (1 << (buffer_idx % 8)))) { + // In case the index is valid, we take it from the data buffer + auto data = static_cast(array->buffers[1]); + return static_cast(data[buffer_idx]); + } + + // In case the index is not valid, we return a default value + return arrow_primitive_missing_value(); + } +}; + +template +std::function get_index_accessor(const char* dtype) { + // Mapping obtained from: + // https://arrow.apache.org/docs/format/CDataInterface.html#data-type-description-format-strings + switch (dtype[0]) { + case 'c': + return ArrayIndexAccessor(); + case 'C': + return ArrayIndexAccessor(); + case 's': + return ArrayIndexAccessor(); + case 'S': + return ArrayIndexAccessor(); + case 'i': + return ArrayIndexAccessor(); + case 'I': + return ArrayIndexAccessor(); + case 'l': + return ArrayIndexAccessor(); + case 'L': + return ArrayIndexAccessor(); + case 'f': + return ArrayIndexAccessor(); + case 'g': + return ArrayIndexAccessor(); + default: + throw std::invalid_argument("unsupported Arrow datatype"); + } +} + +} // namespace LightGBM + +#endif diff --git a/include/LightGBM/bin.h b/include/LightGBM/bin.h index ffb8f2844843..de1bb6eb94ed 100644 --- a/include/LightGBM/bin.h +++ b/include/LightGBM/bin.h @@ -63,8 +63,7 @@ inline static void Int32HistogramSumReducer(const char* src, char* dst, int type const int64_t* src_ptr = reinterpret_cast(src); int64_t* dst_ptr = reinterpret_cast(dst); const comm_size_t steps = (len + (type_size * 2) - 1) / (type_size * 2); - const int num_threads = OMP_NUM_THREADS(); - #pragma omp parallel for schedule(static) num_threads(num_threads) + #pragma omp parallel for schedule(static) num_threads(OMP_NUM_THREADS()) for (comm_size_t i = 0; i < steps; ++i) { dst_ptr[i] += src_ptr[i]; } @@ -74,8 +73,7 @@ inline static void Int16HistogramSumReducer(const char* src, char* dst, int type const int32_t* src_ptr = reinterpret_cast(src); int32_t* dst_ptr = reinterpret_cast(dst); const comm_size_t steps = (len + (type_size * 2) - 1) / (type_size * 2); - const int num_threads = OMP_NUM_THREADS(); - #pragma omp parallel for schedule(static) num_threads(num_threads) + #pragma omp parallel for schedule(static) num_threads(OMP_NUM_THREADS()) for (comm_size_t i = 0; i < steps; ++i) { dst_ptr[i] += src_ptr[i]; } diff --git a/include/LightGBM/c_api.h b/include/LightGBM/c_api.h index bba46a02a492..b43f096c31ee 100644 --- a/include/LightGBM/c_api.h +++ b/include/LightGBM/c_api.h @@ -13,6 +13,7 @@ #ifndef LIGHTGBM_C_API_H_ #define LIGHTGBM_C_API_H_ +#include #include #ifdef __cplusplus @@ -437,6 +438,23 @@ 
LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromMats(int32_t nmat, const DatasetHandle reference, DatasetHandle* out); +/*! + * \brief Create dataset from Arrow. + * \param n_chunks The number of Arrow arrays passed to this function + * \param chunks Pointer to the list of Arrow arrays + * \param schema Pointer to the schema of all Arrow arrays + * \param parameters Additional parameters + * \param reference Used to align bin mapper with other dataset, nullptr means isn't used + * \param[out] out Created dataset + * \return 0 when succeed, -1 when failure happens + */ +LIGHTGBM_C_EXPORT int LGBM_DatasetCreateFromArrow(int64_t n_chunks, + const ArrowArray* chunks, + const ArrowSchema* schema, + const char* parameters, + const DatasetHandle reference, + DatasetHandle *out); + /*! * \brief Create subset of a data. * \param handle Handle of full dataset @@ -537,6 +555,25 @@ LIGHTGBM_C_EXPORT int LGBM_DatasetSetField(DatasetHandle handle, int num_element, int type); +/*! + * \brief Set vector to a content in info. + * \note + * - \a group converts input datatype into ``int32``; + * - \a label and \a weight convert input datatype into ``float32``; + * - \a init_score converts input datatype into ``float64``. + * \param handle Handle of dataset + * \param field_name Field name, can be \a label, \a weight, \a init_score, \a group + * \param n_chunks The number of Arrow arrays passed to this function + * \param chunks Pointer to the list of Arrow arrays + * \param schema Pointer to the schema of all Arrow arrays + * \return 0 when succeed, -1 when failure happens + */ +LIGHTGBM_C_EXPORT int LGBM_DatasetSetFieldFromArrow(DatasetHandle handle, + const char* field_name, + int64_t n_chunks, + const ArrowArray* chunks, + const ArrowSchema* schema); + /*! * \brief Get info vector from dataset. * \param handle Handle of dataset @@ -1380,6 +1417,40 @@ LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForMats(BoosterHandle handle, int64_t* out_len, double* out_result); +/*! + * \brief Make prediction for a new dataset. + * \note + * You should pre-allocate memory for ``out_result``: + * - for normal and raw score, its length is equal to ``num_class * num_data``; + * - for leaf index, its length is equal to ``num_class * num_data * num_iteration``; + * - for feature contributions, its length is equal to ``num_class * num_data * (num_feature + 1)``. + * \param handle Handle of booster + * \param n_chunks The number of Arrow arrays passed to this function + * \param chunks Pointer to the list of Arrow arrays + * \param schema Pointer to the schema of all Arrow arrays + * \param predict_type What should be predicted + * - ``C_API_PREDICT_NORMAL``: normal prediction, with transform (if needed); + * - ``C_API_PREDICT_RAW_SCORE``: raw score; + * - ``C_API_PREDICT_LEAF_INDEX``: leaf index; + * - ``C_API_PREDICT_CONTRIB``: feature contributions (SHAP values) + * \param start_iteration Start index of the iteration to predict + * \param num_iteration Number of iteration for prediction, <= 0 means no limit + * \param parameter Other parameters for prediction, e.g. 
early stopping for prediction + * \param[out] out_len Length of output result + * \param[out] out_result Pointer to array with predictions + * \return 0 when succeed, -1 when failure happens + */ +LIGHTGBM_C_EXPORT int LGBM_BoosterPredictForArrow(BoosterHandle handle, + int64_t n_chunks, + const ArrowArray* chunks, + const ArrowSchema* schema, + int predict_type, + int start_iteration, + int num_iteration, + const char* parameter, + int64_t* out_len, + double* out_result); + /*! * \brief Save model into file. * \param handle Handle of booster @@ -1524,6 +1595,20 @@ LIGHTGBM_C_EXPORT int LGBM_NetworkInitWithFunctions(int num_machines, void* reduce_scatter_ext_fun, void* allgather_ext_fun); +/*! + * \brief Set maximum number of threads used by LightGBM routines in this process. + * \param num_threads maximum number of threads used by LightGBM. -1 means defaulting to omp_get_num_threads(). + * \return 0 when succeed, -1 when failure happens + */ +LIGHTGBM_C_EXPORT int LGBM_SetMaxThreads(int num_threads); + +/*! + * \brief Get current maximum number of threads used by LightGBM routines in this process. + * \param[out] out current maximum number of threads used by LightGBM. -1 means defaulting to omp_get_num_threads(). + * \return 0 when succeed, -1 when failure happens + */ +LIGHTGBM_C_EXPORT int LGBM_GetMaxThreads(int* out); + #if !defined(__cplusplus) && (!defined(__STDC__) || (__STDC_VERSION__ < 199901L)) /*! \brief Inline specifier no-op in C using standards before C99. */ #define INLINE_FUNCTION diff --git a/include/LightGBM/config.h b/include/LightGBM/config.h index 6d61bc764924..6500cb77272d 100644 --- a/include/LightGBM/config.h +++ b/include/LightGBM/config.h @@ -150,7 +150,7 @@ struct Config { // descl2 = ``cross_entropy_lambda``, alternative parameterization of cross-entropy, aliases: ``xentlambda`` // descl2 = label is anything in interval [0, 1] // desc = ranking application - // descl2 = ``lambdarank``, `lambdarank `__ objective. `label_gain <#label_gain>`__ can be used to set the gain (weight) of ``int`` label and all values in ``label`` must be smaller than number of elements in ``label_gain`` + // descl2 = ``lambdarank``, `lambdarank `__ objective. `label_gain <#label_gain>`__ can be used to set the gain (weight) of ``int`` label and all values in ``label`` must be smaller than number of elements in ``label_gain`` // descl2 = ``rank_xendcg``, `XE_NDCG_MART `__ ranking objective function, aliases: ``xendcg``, ``xe_ndcg``, ``xe_ndcg_mart``, ``xendcg_mart`` // descl2 = ``rank_xendcg`` is faster than and achieves the similar performance as ``lambdarank`` // descl2 = label should be ``int`` type, and larger number represents the higher relevance (e.g. 0:bad, 1:fair, 2:good, 3:perfect) @@ -501,14 +501,14 @@ struct Config { // desc = used only if ``monotone_constraints`` is set // desc = monotone constraints method // descl2 = ``basic``, the most basic monotone constraints method. It does not slow the library at all, but over-constrains the predictions - // descl2 = ``intermediate``, a `more advanced method `__, which may slow the library very slightly. However, this method is much less constraining than the basic method and should significantly improve the results - // descl2 = ``advanced``, an `even more advanced method `__, which may slow the library. However, this method is even less constraining than the intermediate method and should again significantly improve the results + // descl2 = ``intermediate``, a `more advanced method `__, which may slow the library very slightly. 
However, this method is much less constraining than the basic method and should significantly improve the results + // descl2 = ``advanced``, an `even more advanced method `__, which may slow the library. However, this method is even less constraining than the intermediate method and should again significantly improve the results std::string monotone_constraints_method = "basic"; // alias = monotone_splits_penalty, ms_penalty, mc_penalty // check = >=0.0 // desc = used only if ``monotone_constraints`` is set - // desc = `monotone penalty `__: a penalization parameter X forbids any monotone splits on the first X (rounded down) level(s) of the tree. The penalty applied to monotone splits on a given depth is a continuous, increasing function the penalization parameter + // desc = `monotone penalty `__: a penalization parameter X forbids any monotone splits on the first X (rounded down) level(s) of the tree. The penalty applied to monotone splits on a given depth is a continuous, increasing function of the penalization parameter // desc = if ``0.0`` (the default), no penalization is applied double monotone_penalty = 0.0; @@ -524,7 +524,7 @@ struct Config { // desc = ``.json`` file can be arbitrarily nested, and each split contains ``feature``, ``threshold`` fields, as well as ``left`` and ``right`` fields representing subsplits // desc = categorical splits are forced in a one-hot fashion, with ``left`` representing the split containing the feature value and ``right`` representing other values // desc = **Note**: the forced split logic will be ignored if the split makes gain worse - // desc = see `this file `__ as an example + // desc = see `this file `__ as an example std::string forcedsplits_filename = ""; // check = >=0.0 @@ -683,7 +683,7 @@ struct Config { bool is_enable_sparse = true; // alias = is_enable_bundle, bundle - // desc = set this to ``false`` to disable Exclusive Feature Bundling (EFB), which is described in `LightGBM: A Highly Efficient Gradient Boosting Decision Tree `__ + // desc = set this to ``false`` to disable Exclusive Feature Bundling (EFB), which is described in `LightGBM: A Highly Efficient Gradient Boosting Decision Tree `__ // desc = **Note**: disabling this may cause slow training speed for sparse datasets bool enable_bundle = true; @@ -770,7 +770,7 @@ struct Config { // desc = path to a ``.json`` file that specifies bin upper bounds for some or all features // desc = ``.json`` file should contain an array of objects, each containing the word ``feature`` (integer feature index) and ``bin_upper_bound`` (array of thresholds for binning) - // desc = see `this file `__ as an example + // desc = see `this file `__ as an example std::string forcedbins_filename = ""; // [no-save] @@ -826,7 +826,7 @@ struct Config { // desc = used only in ``prediction`` task // desc = set this to ``true`` to estimate `SHAP values `__, which represent how each feature contributes to each prediction // desc = produces ``#features + 1`` values where the last value is the expected value of the model output over the training data - // desc = **Note**: if you want to get more explanation for your model's predictions using SHAP values like SHAP interaction values, you can install `shap package `__ + // desc = **Note**: if you want more explanation of your model's predictions using SHAP values, such as SHAP interaction values, you can install the `shap package `__ // desc = **Note**: unlike the shap package, with ``predict_contrib`` we return a matrix with an extra column, where the last column 
is the expected value // desc = **Note**: this feature is not implemented for linear trees bool predict_contrib = false; diff --git a/include/LightGBM/cuda/cuda_algorithms.hpp b/include/LightGBM/cuda/cuda_algorithms.hpp index ab3328bb5561..f79fc57e4f42 100644 --- a/include/LightGBM/cuda/cuda_algorithms.hpp +++ b/include/LightGBM/cuda/cuda_algorithms.hpp @@ -13,7 +13,7 @@ #include #include -#include +#include #include #include diff --git a/include/LightGBM/cuda/cuda_column_data.hpp b/include/LightGBM/cuda/cuda_column_data.hpp index 5b2301ac8de3..314a178859c6 100644 --- a/include/LightGBM/cuda/cuda_column_data.hpp +++ b/include/LightGBM/cuda/cuda_column_data.hpp @@ -9,7 +9,7 @@ #define LIGHTGBM_CUDA_CUDA_COLUMN_DATA_HPP_ #include -#include +#include #include #include diff --git a/include/LightGBM/cuda/cuda_metadata.hpp b/include/LightGBM/cuda/cuda_metadata.hpp index bc7339a84bf7..5882ce7e05c9 100644 --- a/include/LightGBM/cuda/cuda_metadata.hpp +++ b/include/LightGBM/cuda/cuda_metadata.hpp @@ -8,7 +8,7 @@ #ifndef LIGHTGBM_CUDA_CUDA_METADATA_HPP_ #define LIGHTGBM_CUDA_CUDA_METADATA_HPP_ -#include +#include #include #include diff --git a/include/LightGBM/cuda/cuda_metric.hpp b/include/LightGBM/cuda/cuda_metric.hpp index 9186ceea160b..2540b0c1a835 100644 --- a/include/LightGBM/cuda/cuda_metric.hpp +++ b/include/LightGBM/cuda/cuda_metric.hpp @@ -9,7 +9,7 @@ #ifdef USE_CUDA -#include +#include #include namespace LightGBM { diff --git a/include/LightGBM/cuda/cuda_objective_function.hpp b/include/LightGBM/cuda/cuda_objective_function.hpp index fae8aa7ec643..465ed334156c 100644 --- a/include/LightGBM/cuda/cuda_objective_function.hpp +++ b/include/LightGBM/cuda/cuda_objective_function.hpp @@ -9,7 +9,7 @@ #ifdef USE_CUDA -#include +#include #include #include diff --git a/include/LightGBM/cuda/cuda_row_data.hpp b/include/LightGBM/cuda/cuda_row_data.hpp index 0386db0dc300..1d4cb2f73b1e 100644 --- a/include/LightGBM/cuda/cuda_row_data.hpp +++ b/include/LightGBM/cuda/cuda_row_data.hpp @@ -10,7 +10,7 @@ #include #include -#include +#include #include #include #include diff --git a/include/LightGBM/cuda/cuda_split_info.hpp b/include/LightGBM/cuda/cuda_split_info.hpp index 46b35ca37a59..f01ce2b02a02 100644 --- a/include/LightGBM/cuda/cuda_split_info.hpp +++ b/include/LightGBM/cuda/cuda_split_info.hpp @@ -24,12 +24,14 @@ class CUDASplitInfo { double left_sum_gradients; double left_sum_hessians; + int64_t left_sum_of_gradients_hessians; data_size_t left_count; double left_gain; double left_value; double right_sum_gradients; double right_sum_hessians; + int64_t right_sum_of_gradients_hessians; data_size_t right_count; double right_gain; double right_value; diff --git a/include/LightGBM/cuda/cuda_utils.h b/include/LightGBM/cuda/cuda_utils.hu similarity index 91% rename from include/LightGBM/cuda/cuda_utils.h rename to include/LightGBM/cuda/cuda_utils.hu index 953bf9f12e88..4bd84aeb264d 100644 --- a/include/LightGBM/cuda/cuda_utils.h +++ b/include/LightGBM/cuda/cuda_utils.hu @@ -7,15 +7,21 @@ #define LIGHTGBM_CUDA_CUDA_UTILS_H_ #ifdef USE_CUDA + #include #include #include + #include + +#include #include #include namespace LightGBM { +typedef unsigned long long atomic_add_long_t; + #define CUDASUCCESS_OR_FATAL(ans) { gpuAssert((ans), __FILE__, __LINE__); } inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) { if (code != cudaSuccess) { @@ -125,13 +131,19 @@ class CUDAVector { T* new_data = nullptr; AllocateCUDAMemory(&new_data, size, __FILE__, __LINE__); if (size_ > 0 && data_ != 
nullptr) { - CopyFromCUDADeviceToCUDADevice(new_data, data_, size, __FILE__, __LINE__); + const size_t size_for_old_content = std::min(size_, size); + CopyFromCUDADeviceToCUDADevice(new_data, data_, size_for_old_content, __FILE__, __LINE__); } DeallocateCUDAMemory(&data_, __FILE__, __LINE__); data_ = new_data; size_ = size; } + void InitFromHostVector(const std::vector& host_vector) { + Resize(host_vector.size()); + CopyFromHostToCUDADevice(data_, host_vector.data(), host_vector.size(), __FILE__, __LINE__); + } + void Clear() { if (size_ > 0 && data_ != nullptr) { DeallocateCUDAMemory(&data_, __FILE__, __LINE__); @@ -171,6 +183,10 @@ class CUDAVector { return data_; } + void SetValue(int value) { + SetCUDAMemory(data_, value, size_, __FILE__, __LINE__); + } + const T* RawDataReadOnly() const { return data_; } diff --git a/include/LightGBM/dataset.h b/include/LightGBM/dataset.h index e7baa42dc2e6..220a1f9f009c 100644 --- a/include/LightGBM/dataset.h +++ b/include/LightGBM/dataset.h @@ -5,6 +5,7 @@ #ifndef LIGHTGBM_DATASET_H_ #define LIGHTGBM_DATASET_H_ +#include #include #include #include @@ -109,10 +110,13 @@ class Metadata { const std::vector& used_data_indices); void SetLabel(const label_t* label, data_size_t len); + void SetLabel(const ArrowChunkedArray& array); void SetWeights(const label_t* weights, data_size_t len); + void SetWeights(const ArrowChunkedArray& array); void SetQuery(const data_size_t* query, data_size_t len); + void SetQuery(const ArrowChunkedArray& array); void SetPosition(const data_size_t* position, data_size_t len); @@ -121,6 +125,7 @@ class Metadata { * \param init_score Initial scores, this class will manage memory for init_score. */ void SetInitScore(const double* init_score, data_size_t len); + void SetInitScore(const ArrowChunkedArray& array); /*! @@ -333,12 +338,24 @@ class Metadata { void CalculateQueryBoundaries(); /*! \brief Insert labels at the given index */ void InsertLabels(const label_t* labels, data_size_t start_index, data_size_t len); + /*! \brief Set labels from pointers to the first element and the end of an iterator. */ + template + void SetLabelsFromIterator(It first, It last); /*! \brief Insert weights at the given index */ void InsertWeights(const label_t* weights, data_size_t start_index, data_size_t len); + /*! \brief Set weights from pointers to the first element and the end of an iterator. */ + template + void SetWeightsFromIterator(It first, It last); /*! \brief Insert initial scores at the given index */ void InsertInitScores(const double* init_scores, data_size_t start_index, data_size_t len, data_size_t source_size); + /*! \brief Set init scores from pointers to the first element and the end of an iterator. */ + template + void SetInitScoresFromIterator(It first, It last); /*! \brief Insert queries at the given index */ void InsertQueries(const data_size_t* queries, data_size_t start_index, data_size_t len); + /*! \brief Set queries from pointers to the first element and the end of an iterator. */ + template + void SetQueriesFromIterator(It first, It last); /*! \brief Filename of current data */ std::string data_filename_; /*! 
\brief Number of data */ @@ -545,24 +562,29 @@ class Dataset { } } - inline void PushOneRow(int tid, data_size_t row_idx, const std::vector& feature_values) { - if (is_finish_load_) { return; } - for (size_t i = 0; i < feature_values.size() && i < static_cast(num_total_features_); ++i) { - int feature_idx = used_feature_map_[i]; - if (feature_idx >= 0) { - const int group = feature2group_[feature_idx]; - const int sub_feature = feature2subfeature_[feature_idx]; - feature_groups_[group]->PushData(tid, sub_feature, row_idx, feature_values[i]); - if (has_raw_) { - int feat_ind = numeric_feature_map_[feature_idx]; - if (feat_ind >= 0) { - raw_data_[feat_ind][row_idx] = static_cast(feature_values[i]); - } + inline void PushOneValue(int tid, data_size_t row_idx, size_t col_idx, double value) { + if (this->is_finish_load_) + return; + auto feature_idx = this->used_feature_map_[col_idx]; + if (feature_idx >= 0) { + auto group = this->feature2group_[feature_idx]; + auto sub_feature = this->feature2subfeature_[feature_idx]; + this->feature_groups_[group]->PushData(tid, sub_feature, row_idx, value); + if (this->has_raw_) { + auto feat_ind = numeric_feature_map_[feature_idx]; + if (feat_ind >= 0) { + raw_data_[feat_ind][row_idx] = static_cast(value); } } } } + inline void PushOneRow(int tid, data_size_t row_idx, const std::vector& feature_values) { + for (size_t i = 0; i < feature_values.size() && i < static_cast(num_total_features_); ++i) { + this->PushOneValue(tid, row_idx, i, feature_values[i]); + } + } + inline void PushOneRow(int tid, data_size_t row_idx, const std::vector>& feature_values) { if (is_finish_load_) { return; } std::vector is_feature_added(num_features_, false); @@ -649,6 +671,8 @@ class Dataset { LIGHTGBM_EXPORT void FinishLoad(); + bool SetFieldFromArrow(const char* field_name, const ArrowChunkedArray& ca); + LIGHTGBM_EXPORT bool SetFloatField(const char* field_name, const float* field_data, data_size_t num_element); LIGHTGBM_EXPORT bool SetDoubleField(const char* field_name, const double* field_data, data_size_t num_element); diff --git a/include/LightGBM/feature_group.h b/include/LightGBM/feature_group.h index 003df70afad7..f13a5fff966f 100644 --- a/include/LightGBM/feature_group.h +++ b/include/LightGBM/feature_group.h @@ -361,7 +361,7 @@ class FeatureGroup { inline void FinishLoad() { if (is_multi_val_) { OMP_INIT_EX(); -#pragma omp parallel for schedule(guided) +#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(guided) for (int i = 0; i < num_feature_; ++i) { OMP_LOOP_EX_BEGIN(); multi_bin_data_[i]->FinishLoad(); diff --git a/include/LightGBM/sample_strategy.h b/include/LightGBM/sample_strategy.h index 51d3cbc16f23..4ea5cfc5f436 100644 --- a/include/LightGBM/sample_strategy.h +++ b/include/LightGBM/sample_strategy.h @@ -6,7 +6,7 @@ #ifndef LIGHTGBM_SAMPLE_STRATEGY_H_ #define LIGHTGBM_SAMPLE_STRATEGY_H_ -#include +#include #include #include #include diff --git a/include/LightGBM/tree.h b/include/LightGBM/tree.h index 13b3c41a2309..0c4a41f46a87 100644 --- a/include/LightGBM/tree.h +++ b/include/LightGBM/tree.h @@ -185,7 +185,7 @@ class Tree { * \param rate The factor of shrinkage */ virtual inline void Shrinkage(double rate) { -#pragma omp parallel for schedule(static, 1024) if (num_leaves_ >= 2048) +#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 1024) if (num_leaves_ >= 2048) for (int i = 0; i < num_leaves_ - 1; ++i) { leaf_value_[i] = MaybeRoundToZero(leaf_value_[i] * rate); internal_value_[i] = MaybeRoundToZero(internal_value_[i] * 
rate); @@ -210,7 +210,7 @@ class Tree { inline double shrinkage() const { return shrinkage_; } virtual inline void AddBias(double val) { -#pragma omp parallel for schedule(static, 1024) if (num_leaves_ >= 2048) +#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 1024) if (num_leaves_ >= 2048) for (int i = 0; i < num_leaves_ - 1; ++i) { leaf_value_[i] = MaybeRoundToZero(leaf_value_[i] + val); internal_value_[i] = MaybeRoundToZero(internal_value_[i] + val); @@ -218,7 +218,7 @@ class Tree { leaf_value_[num_leaves_ - 1] = MaybeRoundToZero(leaf_value_[num_leaves_ - 1] + val); if (is_linear_) { -#pragma omp parallel for schedule(static, 1024) if (num_leaves_ >= 2048) +#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 1024) if (num_leaves_ >= 2048) for (int i = 0; i < num_leaves_ - 1; ++i) { leaf_const_[i] = MaybeRoundToZero(leaf_const_[i] + val); } diff --git a/include/LightGBM/utils/common.h b/include/LightGBM/utils/common.h index 4e82734858f3..f1b5a10b5a69 100644 --- a/include/LightGBM/utils/common.h +++ b/include/LightGBM/utils/common.h @@ -691,7 +691,7 @@ static void ParallelSort(_RanIt _First, _RanIt _Last, _Pr _Pred, _VTRanIt*) { size_t inner_size = (len + num_threads - 1) / num_threads; inner_size = std::max(inner_size, kMinInnerLen); num_threads = static_cast((len + inner_size - 1) / inner_size); -#pragma omp parallel for schedule(static, 1) +#pragma omp parallel for num_threads(num_threads) schedule(static, 1) for (int i = 0; i < num_threads; ++i) { size_t left = inner_size*i; size_t right = left + inner_size; @@ -707,7 +707,7 @@ static void ParallelSort(_RanIt _First, _RanIt _Last, _Pr _Pred, _VTRanIt*) { // Recursive merge while (s < len) { int loop_size = static_cast((len + s * 2 - 1) / (s * 2)); - #pragma omp parallel for schedule(static, 1) + #pragma omp parallel for num_threads(num_threads) schedule(static, 1) for (int i = 0; i < loop_size; ++i) { size_t left = i * 2 * s; size_t mid = left + s; diff --git a/include/LightGBM/utils/openmp_wrapper.h b/include/LightGBM/utils/openmp_wrapper.h index a337fc353b75..b9a8ea2982fc 100644 --- a/include/LightGBM/utils/openmp_wrapper.h +++ b/include/LightGBM/utils/openmp_wrapper.h @@ -5,6 +5,15 @@ #ifndef LIGHTGBM_OPENMP_WRAPPER_H_ #define LIGHTGBM_OPENMP_WRAPPER_H_ +#include + +// this can only be changed by LGBM_SetMaxThreads() +LIGHTGBM_EXTERN_C int LGBM_MAX_NUM_THREADS; + +// this is modified by OMP_SET_NUM_THREADS(), for example +// by passing num_thread through params +LIGHTGBM_EXTERN_C int LGBM_DEFAULT_NUM_THREADS; + #ifdef _OPENMP #include @@ -17,22 +26,25 @@ #include #include -inline int OMP_NUM_THREADS() { - int ret = 1; -#pragma omp parallel -#pragma omp master - { ret = omp_get_num_threads(); } - return ret; -} - -inline void OMP_SET_NUM_THREADS(int num_threads) { - static const int default_omp_num_threads = OMP_NUM_THREADS(); - if (num_threads > 0) { - omp_set_num_threads(num_threads); - } else { - omp_set_num_threads(default_omp_num_threads); - } -} +/* + Get number of threads to use in OpenMP parallel regions. + + By default, this will return the result of omp_get_max_threads(), + which is OpenMP-implementation dependent but generally can be controlled + by environment variable OMP_NUM_THREADS. 
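From the application side, the process-wide cap declared in c_api.h (``LGBM_SetMaxThreads`` / ``LGBM_GetMaxThreads``) can be exercised roughly as follows. This is a hedged sketch in Python, assuming a build of this branch; ``_LIB`` and ``_safe_call`` are the package's private ctypes plumbing and are used here purely for illustration:

import ctypes
from lightgbm.basic import _LIB, _safe_call

# Cap all LightGBM OpenMP regions in this process at 4 threads; this is the
# ceiling that OMP_NUM_THREADS() respects, on top of any per-call settings.
_safe_call(_LIB.LGBM_SetMaxThreads(ctypes.c_int(4)))

# Read the cap back; -1 means no process-wide cap is in effect.
out = ctypes.c_int(0)
_safe_call(_LIB.LGBM_GetMaxThreads(ctypes.byref(out)))
print(out.value)  # 4
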
+ + ref: + - https://www.openmp.org/spec-html/5.0/openmpsu112.html + - https://gcc.gnu.org/onlinedocs/libgomp/omp_005fget_005fmax_005fthreads.html +*/ +LIGHTGBM_EXTERN_C int OMP_NUM_THREADS(); + +/* + Update the default number of threads that'll be used in OpenMP parallel + regions for LightGBM routines where the number of threads aren't directly + supplied. +*/ +LIGHTGBM_EXTERN_C void OMP_SET_NUM_THREADS(int num_threads); class ThreadExceptionHelper { public: @@ -102,10 +114,7 @@ class ThreadExceptionHelper { /** Fall here if no OPENMP support, so just simulate a single thread running. All #pragma omp should be ignored by the compiler **/ - inline void omp_set_num_threads(int) __GOMP_NOTHROW {} // NOLINT (no cast done here) inline void OMP_SET_NUM_THREADS(int) __GOMP_NOTHROW {} - inline int omp_get_num_threads() __GOMP_NOTHROW {return 1;} - inline int omp_get_max_threads() __GOMP_NOTHROW {return 1;} inline int omp_get_thread_num() __GOMP_NOTHROW {return 0;} inline int OMP_NUM_THREADS() __GOMP_NOTHROW { return 1; } #ifdef __cplusplus diff --git a/include/LightGBM/utils/threading.h b/include/LightGBM/utils/threading.h index a093f87c1c8b..41362ee5db49 100644 --- a/include/LightGBM/utils/threading.h +++ b/include/LightGBM/utils/threading.h @@ -73,7 +73,7 @@ class Threading { INDEX_T num_inner = end - start; BlockInfo(num_inner, min_block_size, &n_block, &num_inner); OMP_INIT_EX(); -#pragma omp parallel for schedule(static, 1) +#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 1) for (int i = 0; i < n_block; ++i) { OMP_LOOP_EX_BEGIN(); INDEX_T inner_start = start + num_inner * i; diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index dac3224a54b2..19a53c6bb70f 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -18,12 +18,21 @@ import numpy as np import scipy.sparse -from .compat import PANDAS_INSTALLED, concat, dt_DataTable, pd_CategoricalDtype, pd_DataFrame, pd_Series +from .compat import (PANDAS_INSTALLED, PYARROW_INSTALLED, arrow_cffi, arrow_is_floating, arrow_is_integer, concat, + dt_DataTable, pa_Array, pa_chunked_array, pa_ChunkedArray, pa_compute, pa_Table, + pd_CategoricalDtype, pd_DataFrame, pd_Series) from .libpath import find_lib_path if TYPE_CHECKING: from typing import Literal + # typing.TypeGuard was only introduced in Python 3.10 + try: + from typing import TypeGuard + except ImportError: + from typing_extensions import TypeGuard + + __all__ = [ 'Booster', 'Dataset', @@ -61,7 +70,9 @@ List[float], List[int], np.ndarray, - pd_Series + pd_Series, + pa_Array, + pa_ChunkedArray, ] _LGBM_PositionType = Union[ np.ndarray, @@ -73,6 +84,9 @@ np.ndarray, pd_Series, pd_DataFrame, + pa_Table, + pa_Array, + pa_ChunkedArray, ] _LGBM_TrainDataType = Union[ str, @@ -83,14 +97,17 @@ scipy.sparse.spmatrix, "Sequence", List["Sequence"], - List[np.ndarray] + List[np.ndarray], + pa_Table ] _LGBM_LabelType = Union[ List[float], List[int], np.ndarray, pd_Series, - pd_DataFrame + pd_DataFrame, + pa_Array, + pa_ChunkedArray, ] _LGBM_PredictDataType = Union[ str, @@ -98,13 +115,16 @@ np.ndarray, pd_DataFrame, dt_DataTable, - scipy.sparse.spmatrix + scipy.sparse.spmatrix, + pa_Table, ] _LGBM_WeightType = Union[ List[float], List[int], np.ndarray, - pd_Series + pd_Series, + pa_Array, + pa_ChunkedArray, ] ZERO_THRESHOLD = 1e-35 @@ -279,6 +299,20 @@ def _is_1d_list(data: Any) -> bool: return isinstance(data, list) and (not data or _is_numeric(data[0])) +def _is_list_of_numpy_arrays(data: Any) -> 
"TypeGuard[List[np.ndarray]]": + return ( + isinstance(data, list) + and all(isinstance(x, np.ndarray) for x in data) + ) + + +def _is_list_of_sequences(data: Any) -> "TypeGuard[List[Sequence]]": + return ( + isinstance(data, list) + and all(isinstance(x, Sequence) for x in data) + ) + + def _is_1d_collection(data: Any) -> bool: """Check whether data is a 1-D collection.""" return ( @@ -330,6 +364,68 @@ def _is_2d_collection(data: Any) -> bool: ) +def _is_pyarrow_array(data: Any) -> bool: + """Check whether data is a PyArrow array.""" + return isinstance(data, (pa_Array, pa_ChunkedArray)) + + +def _is_pyarrow_table(data: Any) -> bool: + """Check whether data is a PyArrow table.""" + return isinstance(data, pa_Table) + + +class _ArrowCArray: + """Simple wrapper around the C representation of an Arrow type.""" + + n_chunks: int + chunks: arrow_cffi.CData + schema: arrow_cffi.CData + + def __init__(self, n_chunks: int, chunks: arrow_cffi.CData, schema: arrow_cffi.CData): + self.n_chunks = n_chunks + self.chunks = chunks + self.schema = schema + + @property + def chunks_ptr(self) -> int: + """Returns the address of the pointer to the list of chunks making up the array.""" + return int(arrow_cffi.cast("uintptr_t", arrow_cffi.addressof(self.chunks[0]))) + + @property + def schema_ptr(self) -> int: + """Returns the address of the pointer to the schema of the array.""" + return int(arrow_cffi.cast("uintptr_t", self.schema)) + + +def _export_arrow_to_c(data: pa_Table) -> _ArrowCArray: + """Export an Arrow type to its C representation.""" + # Obtain objects to export + if isinstance(data, pa_Array): + export_objects = [data] + elif isinstance(data, pa_ChunkedArray): + export_objects = data.chunks + elif isinstance(data, pa_Table): + export_objects = data.to_batches() + else: + raise ValueError(f"data of type '{type(data)}' cannot be exported to Arrow") + + # Prepare export + chunks = arrow_cffi.new("struct ArrowArray[]", len(export_objects)) + schema = arrow_cffi.new("struct ArrowSchema*") + + # Export all objects + for i, obj in enumerate(export_objects): + chunk_ptr = int(arrow_cffi.cast("uintptr_t", arrow_cffi.addressof(chunks[i]))) + if i == 0: + schema_ptr = int(arrow_cffi.cast("uintptr_t", schema)) + obj._export_to_c(chunk_ptr, schema_ptr) + else: + obj._export_to_c(chunk_ptr) + + return _ArrowCArray(len(chunks), chunks, schema) + + + def _data_to_2d_numpy( data: Any, dtype: "np.typing.DTypeLike", @@ -347,7 +443,7 @@ def _data_to_2d_numpy( "It should be list of lists, numpy 2-D array or pandas DataFrame") -def _cfloat32_array_to_numpy(cptr: "ctypes._Pointer", length: int) -> np.ndarray: +def _cfloat32_array_to_numpy(*, cptr: "ctypes._Pointer", length: int) -> np.ndarray: """Convert a ctypes float pointer array to a numpy array.""" if isinstance(cptr, ctypes.POINTER(ctypes.c_float)): return np.ctypeslib.as_array(cptr, shape=(length,)).copy() @@ -355,7 +451,7 @@ def _cfloat32_array_to_numpy(cptr: "ctypes._Pointer", length: int) -> np.ndarray raise RuntimeError('Expected float pointer') -def _cfloat64_array_to_numpy(cptr: "ctypes._Pointer", length: int) -> np.ndarray: +def _cfloat64_array_to_numpy(*, cptr: "ctypes._Pointer", length: int) -> np.ndarray: """Convert a ctypes double pointer array to a numpy array.""" if isinstance(cptr, ctypes.POINTER(ctypes.c_double)): return np.ctypeslib.as_array(cptr, shape=(length,)).copy() @@ -363,7 +459,7 @@ def _cfloat64_array_to_numpy(cptr: "ctypes._Pointer", length: int) -> np.ndarray raise RuntimeError('Expected double pointer') -def 
_cint32_array_to_numpy(cptr: "ctypes._Pointer", length: int) -> np.ndarray: +def _cint32_array_to_numpy(*, cptr: "ctypes._Pointer", length: int) -> np.ndarray: """Convert a ctypes int pointer array to a numpy array.""" if isinstance(cptr, ctypes.POINTER(ctypes.c_int32)): return np.ctypeslib.as_array(cptr, shape=(length,)).copy() @@ -371,7 +467,7 @@ def _cint32_array_to_numpy(cptr: "ctypes._Pointer", length: int) -> np.ndarray: raise RuntimeError('Expected int32 pointer') -def _cint64_array_to_numpy(cptr: "ctypes._Pointer", length: int) -> np.ndarray: +def _cint64_array_to_numpy(*, cptr: "ctypes._Pointer", length: int) -> np.ndarray: """Convert a ctypes int pointer array to a numpy array.""" if isinstance(cptr, ctypes.POINTER(ctypes.c_int64)): return np.ctypeslib.as_array(cptr, shape=(length,)).copy() @@ -458,7 +554,7 @@ def _get_all_param_aliases() -> Dict[str, List[str]]: buffer_len = 1 << 20 tmp_out_len = ctypes.c_int64(0) string_buffer = ctypes.create_string_buffer(buffer_len) - ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) + ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer)) _safe_call(_LIB.LGBM_DumpParamAliases( ctypes.c_int64(buffer_len), ctypes.byref(tmp_out_len), @@ -467,7 +563,7 @@ def _get_all_param_aliases() -> Dict[str, List[str]]: # if buffer length is not long enough, re-allocate a buffer if actual_len > buffer_len: string_buffer = ctypes.create_string_buffer(actual_len) - ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) + ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer)) _safe_call(_LIB.LGBM_DumpParamAliases( ctypes.c_int64(actual_len), ctypes.byref(tmp_out_len), @@ -668,22 +764,43 @@ def _check_for_bad_pandas_dtypes(pandas_dtypes_series: pd_Series) -> None: f'Fields with bad pandas dtypes: {", ".join(bad_pandas_dtypes)}') +def _pandas_to_numpy( + data: pd_DataFrame, + target_dtype: "np.typing.DTypeLike" +) -> np.ndarray: + _check_for_bad_pandas_dtypes(data.dtypes) + try: + # most common case (no nullable dtypes) + return data.to_numpy(dtype=target_dtype, copy=False) + except TypeError: + # 1.0 <= pd version < 1.1 and nullable dtypes, least common case + # raises error because array is casted to type(pd.NA) and there's no na_value argument + return data.astype(target_dtype, copy=False).values + except ValueError: + # data has nullable dtypes, but we can specify na_value argument and copy will be made + return data.to_numpy(dtype=target_dtype, na_value=np.nan) + + def _data_from_pandas( data: pd_DataFrame, feature_name: _LGBM_FeatureNameConfiguration, categorical_feature: _LGBM_CategoricalFeatureConfiguration, pandas_categorical: Optional[List[List]] -) -> Tuple[np.ndarray, List[str], List[str], List[List]]: +) -> Tuple[np.ndarray, List[str], Union[List[str], List[int]], List[List]]: if len(data.shape) != 2 or data.shape[0] < 1: raise ValueError('Input data must be 2 dimensional and non empty.') + # take shallow copy in case we modify categorical columns + # whole column modifications don't change the original df + data = data.copy(deep=False) + # determine feature names if feature_name == 'auto': feature_name = [str(col) for col in data.columns] # determine categorical features cat_cols = [col for col, dtype in zip(data.columns, data.dtypes) if isinstance(dtype, pd_CategoricalDtype)] - cat_cols_not_ordered = [col for col in cat_cols if not data[col].cat.ordered] + cat_cols_not_ordered: List[str] = [col for col in cat_cols if not data[col].cat.ordered] if pandas_categorical is None: # train 
dataset pandas_categorical = [list(data[col].cat.categories) for col in cat_cols] else: @@ -693,29 +810,23 @@ def _data_from_pandas( if list(data[col].cat.categories) != list(category): data[col] = data[col].cat.set_categories(category) if len(cat_cols): # cat_cols is list - data = data.copy(deep=False) # not alter origin DataFrame data[cat_cols] = data[cat_cols].apply(lambda x: x.cat.codes).replace({-1: np.nan}) - if categorical_feature == 'auto': # use cat cols from DataFrame + + # use cat cols from DataFrame + if categorical_feature == 'auto': categorical_feature = cat_cols_not_ordered - else: # use cat cols specified by user - categorical_feature = list(categorical_feature) # type: ignore[assignment] - # get numpy representation of the data - _check_for_bad_pandas_dtypes(data.dtypes) df_dtypes = [dtype.type for dtype in data.dtypes] - df_dtypes.append(np.float32) # so that the target dtype considers floats + # so that the target dtype considers floats + df_dtypes.append(np.float32) target_dtype = np.result_type(*df_dtypes) - try: - # most common case (no nullable dtypes) - data = data.to_numpy(dtype=target_dtype, copy=False) - except TypeError: - # 1.0 <= pd version < 1.1 and nullable dtypes, least common case - # raises error because array is casted to type(pd.NA) and there's no na_value argument - data = data.astype(target_dtype, copy=False).values - except ValueError: - # data has nullable dtypes, but we can specify na_value argument and copy will be made - data = data.to_numpy(dtype=target_dtype, na_value=np.nan) - return data, feature_name, categorical_feature, pandas_categorical + + return ( + _pandas_to_numpy(data, target_dtype=target_dtype), + feature_name, + categorical_feature, + pandas_categorical + ) def _dump_pandas_categorical( @@ -959,7 +1070,7 @@ def predict( Parameters ---------- - data : str, pathlib.Path, numpy array, pandas DataFrame, H2O DataTable's Frame or scipy.sparse + data : str, pathlib.Path, numpy array, pandas DataFrame, pyarrow Table, H2O DataTable's Frame or scipy.sparse Data source for prediction. If str or pathlib.Path, it represents the path to a text file (CSV, TSV, or LibSVM). 
start_iteration : int, optional (default=0) @@ -1051,6 +1162,13 @@ def predict( num_iteration=num_iteration, predict_type=predict_type ) + elif _is_pyarrow_table(data): + preds, nrow = self.__pred_for_pyarrow_table( + table=data, + start_iteration=start_iteration, + num_iteration=num_iteration, + predict_type=predict_type + ) elif isinstance(data, list): try: data = np.array(data) @@ -1208,18 +1326,18 @@ def __create_sparse_native( data_indices_len = out_shape[0] indptr_len = out_shape[1] if indptr_type == _C_API_DTYPE_INT32: - out_indptr = _cint32_array_to_numpy(out_ptr_indptr, indptr_len) + out_indptr = _cint32_array_to_numpy(cptr=out_ptr_indptr, length=indptr_len) elif indptr_type == _C_API_DTYPE_INT64: - out_indptr = _cint64_array_to_numpy(out_ptr_indptr, indptr_len) + out_indptr = _cint64_array_to_numpy(cptr=out_ptr_indptr, length=indptr_len) else: raise TypeError("Expected int32 or int64 type for indptr") if data_type == _C_API_DTYPE_FLOAT32: - out_data = _cfloat32_array_to_numpy(out_ptr_data, data_indices_len) + out_data = _cfloat32_array_to_numpy(cptr=out_ptr_data, length=data_indices_len) elif data_type == _C_API_DTYPE_FLOAT64: - out_data = _cfloat64_array_to_numpy(out_ptr_data, data_indices_len) + out_data = _cfloat64_array_to_numpy(cptr=out_ptr_data, length=data_indices_len) else: raise TypeError("Expected float32 or float64 type for data") - out_indices = _cint32_array_to_numpy(out_ptr_indices, data_indices_len) + out_indices = _cint32_array_to_numpy(cptr=out_ptr_indices, length=data_indices_len) # break up indptr based on number of rows (note more than one matrix in multiclass case) per_class_indptr_shape = cs.indptr.shape[0] # for CSC there is extra column added @@ -1504,6 +1622,48 @@ def __pred_for_csc( if n_preds != out_num_preds.value: raise ValueError("Wrong length for predict results") return preds, nrow + + def __pred_for_pyarrow_table( + self, + table: pa_Table, + start_iteration: int, + num_iteration: int, + predict_type: int + ) -> Tuple[np.ndarray, int]: + """Predict for a PyArrow table.""" + if not PYARROW_INSTALLED: + raise LightGBMError("Cannot predict from Arrow without `pyarrow` installed.") + + # Check that the input is valid: we only handle numbers (for now) + if not all(arrow_is_integer(t) or arrow_is_floating(t) for t in table.schema.types): + raise ValueError("Arrow table may only have integer or floating point datatypes") + + # Prepare prediction output array + n_preds = self.__get_num_preds( + start_iteration=start_iteration, + num_iteration=num_iteration, + nrow=table.num_rows, + predict_type=predict_type + ) + preds = np.empty(n_preds, dtype=np.float64) + out_num_preds = ctypes.c_int64(0) + + # Export Arrow table to C and run prediction + c_array = _export_arrow_to_c(table) + _safe_call(_LIB.LGBM_BoosterPredictForArrow( + self._handle, + ctypes.c_int64(c_array.n_chunks), + ctypes.c_void_p(c_array.chunks_ptr), + ctypes.c_void_p(c_array.schema_ptr), + ctypes.c_int(predict_type), + ctypes.c_int(start_iteration), + ctypes.c_int(num_iteration), + _c_str(self.pred_parameter), + ctypes.byref(out_num_preds), + preds.ctypes.data_as(ctypes.POINTER(ctypes.c_double)))) + if n_preds != out_num_preds.value: + raise ValueError("Wrong length for predict results") + return preds, table.num_rows def current_iteration(self) -> int: """Get the index of the current iteration. 
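Taken together with ``LGBM_BoosterPredictForArrow`` above, the new prediction path can be exercised from Python roughly as follows. This is a usage sketch assuming a build of this branch with pyarrow installed; the column names and values are illustrative:

import numpy as np
import pyarrow as pa
import lightgbm as lgb

rng = np.random.default_rng(0)
table = pa.table({
    "feat_0": pa.array(rng.random(100)),
    "feat_1": pa.array(rng.integers(0, 10, size=100), type=pa.int32()),
})
label = pa.array(rng.random(100))

ds = lgb.Dataset(table, label=label)
booster = lgb.train({"objective": "regression", "verbosity": -1, "min_data_in_leaf": 5},
                    ds, num_boost_round=10)

# The table is exported through the Arrow C data interface and handed to
# LGBM_BoosterPredictForArrow via __pred_for_pyarrow_table().
preds = booster.predict(table)
assert preds.shape == (table.num_rows,)

Only integer and floating point columns are accepted on this path; any other Arrow datatype raises a ValueError before the export happens.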
@@ -1541,26 +1701,26 @@ def __init__( Parameters ---------- - data : str, pathlib.Path, numpy array, pandas DataFrame, H2O DataTable's Frame, scipy.sparse, Sequence, list of Sequence or list of numpy array + data : str, pathlib.Path, numpy array, pandas DataFrame, H2O DataTable's Frame, scipy.sparse, Sequence, list of Sequence, list of numpy array or pyarrow Table Data source of Dataset. If str or pathlib.Path, it represents the path to a text file (CSV, TSV, or LibSVM) or a LightGBM Dataset binary file. - label : list, numpy 1-D array, pandas Series / one-column DataFrame or None, optional (default=None) + label : list, numpy 1-D array, pandas Series / one-column DataFrame, pyarrow Array, pyarrow ChunkedArray or None, optional (default=None) Label of the data. reference : Dataset or None, optional (default=None) If this is Dataset for validation, training data should be used as reference. - weight : list, numpy 1-D array, pandas Series or None, optional (default=None) + weight : list, numpy 1-D array, pandas Series, pyarrow Array, pyarrow ChunkedArray or None, optional (default=None) Weight for each instance. Weights should be non-negative. - group : list, numpy 1-D array, pandas Series or None, optional (default=None) + group : list, numpy 1-D array, pandas Series, pyarrow Array, pyarrow ChunkedArray or None, optional (default=None) Group/query data. Only used in the learning-to-rank task. sum(group) = n_samples. For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. - init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), or None, optional (default=None) + init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), pyarrow Array, pyarrow ChunkedArray, pyarrow Table (for multi-class task) or None, optional (default=None) Init score for Dataset. feature_name : list of str, or 'auto', optional (default="auto") Feature names. - If 'auto' and data is pandas DataFrame, data columns names are used. + If 'auto' and data is pandas DataFrame or pyarrow Table, data columns names are used. categorical_feature : list of str or int, or 'auto', optional (default="auto") Categorical features. If list of int, interpreted as indices. 
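As a concrete illustration of the expanded docstring above, label, weight, and group can all be Arrow-backed under this change. A minimal sketch, assuming pyarrow is installed and LightGBM is built from this branch; the data values are made up:

import numpy as np
import pyarrow as pa
import lightgbm as lgb

n = 100
data = pa.table({"f0": pa.array(np.linspace(0.0, 1.0, n))})
# A ChunkedArray is also accepted; its chunks are forwarded to C one by one.
label = pa.chunked_array([pa.array([0.0, 1.0] * 25), pa.array([1.0, 0.0] * 25)])
weight = pa.array([0.5, 1.5] * 50)
# Six query groups whose sizes sum to n, matching the example in the docstring.
group = pa.array([10, 20, 40, 10, 10, 10], type=pa.int32())

ds = lgb.Dataset(data, label=label, weight=weight, group=group)
ds.construct()
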
@@ -1917,10 +2077,13 @@ def _lazy_init( self.__init_from_csc(data, params_str, ref_dataset) elif isinstance(data, np.ndarray): self.__init_from_np2d(data, params_str, ref_dataset) + elif _is_pyarrow_table(data): + self.__init_from_pyarrow_table(data, params_str, ref_dataset) + feature_name = data.column_names elif isinstance(data, list) and len(data) > 0: - if all(isinstance(x, np.ndarray) for x in data): + if _is_list_of_numpy_arrays(data): self.__init_from_list_np2d(data, params_str, ref_dataset) - elif all(isinstance(x, Sequence) for x in data): + elif _is_list_of_sequences(data): self.__init_from_seqs(data, ref_dataset) else: raise TypeError('Data list can only be of ndarray or Sequence') @@ -2177,10 +2340,36 @@ def __init_from_csc( ctypes.byref(self._handle))) return self + def __init_from_pyarrow_table( + self, + table: pa_Table, + params_str: str, + ref_dataset: Optional[_DatasetHandle] + ) -> "Dataset": + """Initialize data from a PyArrow table.""" + if not PYARROW_INSTALLED: + raise LightGBMError("Cannot init dataframe from Arrow without `pyarrow` installed.") + + # Check that the input is valid: we only handle numbers (for now) + if not all(arrow_is_integer(t) or arrow_is_floating(t) for t in table.schema.types): + raise ValueError("Arrow table may only have integer or floating point datatypes") + + # Export Arrow table to C + c_array = _export_arrow_to_c(table) + self._handle = ctypes.c_void_p() + _safe_call(_LIB.LGBM_DatasetCreateFromArrow( + ctypes.c_int64(c_array.n_chunks), + ctypes.c_void_p(c_array.chunks_ptr), + ctypes.c_void_p(c_array.schema_ptr), + _c_str(params_str), + ref_dataset, + ctypes.byref(self._handle))) + return self + @staticmethod def _compare_params_for_warning( - params: Optional[Dict[str, Any]], - other_params: Optional[Dict[str, Any]], + params: Dict[str, Any], + other_params: Dict[str, Any], ignore_keys: Set[str] ) -> bool: """Compare two dictionaries with params ignoring some keys. @@ -2189,9 +2378,9 @@ def _compare_params_for_warning( Parameters ---------- - params : dict or None + params : dict One dictionary with parameters to compare. - other_params : dict or None + other_params : dict Another dictionary with parameters to compare. ignore_keys : set Keys that should be ignored during comparing two dictionaries. @@ -2201,10 +2390,6 @@ def _compare_params_for_warning( compare_result : bool Returns whether two dictionaries with params are equal. """ - if params is None: - params = {} - if other_params is None: - other_params = {} for k in other_params: if k not in ignore_keys: if k not in params or params[k] != other_params[k]: @@ -2301,17 +2486,17 @@ def create_valid( data : str, pathlib.Path, numpy array, pandas DataFrame, H2O DataTable's Frame, scipy.sparse, Sequence, list of Sequence or list of numpy array Data source of Dataset. If str or pathlib.Path, it represents the path to a text file (CSV, TSV, or LibSVM) or a LightGBM Dataset binary file. - label : list, numpy 1-D array, pandas Series / one-column DataFrame or None, optional (default=None) + label : list, numpy 1-D array, pandas Series / one-column DataFrame, pyarrow Array, pyarrow ChunkedArray or None, optional (default=None) Label of the data. - weight : list, numpy 1-D array, pandas Series or None, optional (default=None) + weight : list, numpy 1-D array, pandas Series, pyarrow Array, pyarrow ChunkedArray or None, optional (default=None) Weight for each instance. Weights should be non-negative. 
- group : list, numpy 1-D array, pandas Series or None, optional (default=None) + group : list, numpy 1-D array, pandas Series, pyarrow Array, pyarrow ChunkedArray or None, optional (default=None) Group/query data. Only used in the learning-to-rank task. sum(group) = n_samples. For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. - init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), or None, optional (default=None) + init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), pyarrow Array, pyarrow ChunkedArray, pyarrow Table (for multi-class task) or None, optional (default=None) Init score for Dataset. params : dict or None, optional (default=None) Other parameters for validation Dataset. @@ -2418,7 +2603,7 @@ def _reverse_update_params(self) -> "Dataset": def set_field( self, field_name: str, - data: Optional[Union[List[List[float]], List[List[int]], List[float], List[int], np.ndarray, pd_Series, pd_DataFrame]] + data: Optional[Union[List[List[float]], List[List[int]], List[float], List[int], np.ndarray, pd_Series, pd_DataFrame, pa_Table, pa_Array, pa_ChunkedArray]] ) -> "Dataset": """Set property into the Dataset. @@ -2426,7 +2611,7 @@ def set_field( ---------- field_name : str The field name of the information. - data : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), or None + data : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), pyarrow Array, pyarrow ChunkedArray, pyarrow Table (for ``init_score`` only) or None The data to be set. Returns @@ -2445,6 +2630,29 @@ def set_field( ctypes.c_int(0), ctypes.c_int(_FIELD_TYPE_MAPPER[field_name]))) return self + + # If the data is Arrow data, we can just pass it to C + if _is_pyarrow_array(data) or _is_pyarrow_table(data): + # If a table is being passed, we concatenate the columns. This is only valid for + # 'init_score'. + if _is_pyarrow_table(data): + if field_name != "init_score": + raise ValueError(f"pyarrow tables are not supported for field '{field_name}'") + data = pa_chunked_array([ + chunk for array in data.columns for chunk in array.chunks # type: ignore + ]) + + c_array = _export_arrow_to_c(data) + _safe_call(_LIB.LGBM_DatasetSetFieldFromArrow( + self._handle, + _c_str(field_name), + ctypes.c_int64(c_array.n_chunks), + ctypes.c_void_p(c_array.chunks_ptr), + ctypes.c_void_p(c_array.schema_ptr), + )) + self.version += 1 + return self + dtype: "np.typing.DTypeLike" if field_name == 'init_score': dtype = np.float64 @@ -2483,6 +2691,12 @@ def set_field( def get_field(self, field_name: str) -> Optional[np.ndarray]: """Get property from the Dataset. + Can only be run on a constructed Dataset. + + Unlike ``get_group()``, ``get_init_score()``, ``get_label()``, ``get_position()``, and ``get_weight()``, + this method ignores any raw data passed into ``lgb.Dataset()`` on the Python side, and will only read + data from the constructed C++ ``Dataset`` object.
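For context on the table branch in ``set_field`` above: a multi-column ``init_score`` table is flattened column by column before being exported to C. A standalone sketch of that concatenation (only `pyarrow` required; values are illustrative):

    import pyarrow as pa

    # Two-column init_score, e.g. for a 2-class task ...
    init_scores = pa.table({"c0": [0.1, 0.2], "c1": [0.3, 0.4]})

    # ... becomes a single chunked array in column-major order, i.e. the layout
    # LightGBM expects for multi-class init_score (all rows of class 0, then class 1).
    flat = pa.chunked_array([chunk for col in init_scores.columns for chunk in col.chunks])
    print(flat.to_pylist())  # [0.1, 0.2, 0.3, 0.4]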
+ Parameters ---------- field_name : str @@ -2509,11 +2723,20 @@ def get_field(self, field_name: str) -> Optional[np.ndarray]: if tmp_out_len.value == 0: return None if out_type.value == _C_API_DTYPE_INT32: - arr = _cint32_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_int32)), tmp_out_len.value) + arr = _cint32_array_to_numpy( + cptr=ctypes.cast(ret, ctypes.POINTER(ctypes.c_int32)), + length=tmp_out_len.value + ) elif out_type.value == _C_API_DTYPE_FLOAT32: - arr = _cfloat32_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_float)), tmp_out_len.value) + arr = _cfloat32_array_to_numpy( + cptr=ctypes.cast(ret, ctypes.POINTER(ctypes.c_float)), + length=tmp_out_len.value + ) elif out_type.value == _C_API_DTYPE_FLOAT64: - arr = _cfloat64_array_to_numpy(ctypes.cast(ret, ctypes.POINTER(ctypes.c_double)), tmp_out_len.value) + arr = _cfloat64_array_to_numpy( + cptr=ctypes.cast(ret, ctypes.POINTER(ctypes.c_double)), + length=tmp_out_len.value + ) else: raise TypeError("Unknown type") if field_name == 'init_score': @@ -2646,7 +2869,7 @@ def set_label(self, label: Optional[_LGBM_LabelType]) -> "Dataset": Parameters ---------- - label : list, numpy 1-D array, pandas Series / one-column DataFrame or None + label : list, numpy 1-D array, pandas Series / one-column DataFrame, pyarrow Array, pyarrow ChunkedArray or None The label information to be set into Dataset. Returns @@ -2659,18 +2882,9 @@ def set_label(self, label: Optional[_LGBM_LabelType]) -> "Dataset": if isinstance(label, pd_DataFrame): if len(label.columns) > 1: raise ValueError('DataFrame for label cannot have multiple columns') - _check_for_bad_pandas_dtypes(label.dtypes) - try: - # most common case (no nullable dtypes) - label = label.to_numpy(dtype=np.float32, copy=False) - except TypeError: - # 1.0 <= pd version < 1.1 and nullable dtypes, least common case - # raises error because array is casted to type(pd.NA) and there's no na_value argument - label = label.astype(np.float32, copy=False).values - except ValueError: - # data has nullable dtypes, but we can specify na_value argument and copy will be made - label = label.to_numpy(dtype=np.float32, na_value=np.nan) - label_array = np.ravel(label) + label_array = np.ravel(_pandas_to_numpy(label, target_dtype=np.float32)) + elif _is_pyarrow_array(label): + label_array = label else: label_array = _list_to_1d_numpy(label, dtype=np.float32, name='label') self.set_field('label', label_array) @@ -2685,7 +2899,7 @@ def set_weight( Parameters ---------- - weight : list, numpy 1-D array, pandas Series or None + weight : list, numpy 1-D array, pandas Series, pyarrow Array, pyarrow ChunkedArray or None Weight to be set for each data point. Weights should be non-negative. Returns @@ -2693,11 +2907,19 @@ def set_weight( self : Dataset Dataset with set weight. 
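The body below drops all-ones weights, since uniform weights carry no information. A minimal sketch of the resulting behaviour (assuming `pyarrow` is installed):

    import lightgbm as lgb
    import numpy as np
    import pyarrow as pa

    ds = lgb.Dataset(np.random.rand(3, 2), label=[0.0, 1.0, 0.0])
    ds.set_weight(pa.array([1.0, 1.0, 1.0]))  # all ones -> stored as None (uniform)
    print(ds.weight)                          # None
    ds.set_weight(pa.array([0.5, 1.0, 2.0]))  # kept; pushed to C++ once constructed
    print(ds.weight)                          # the pyarrow Array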
""" - if weight is not None and np.all(weight == 1): - weight = None + # Check if the weight contains values other than one + if weight is not None: + if _is_pyarrow_array(weight): + if pa_compute.all(pa_compute.equal(weight, 1)).as_py(): + weight = None + elif np.all(weight == 1): + weight = None self.weight = weight + + # Set field if self._handle is not None and weight is not None: - weight = _list_to_1d_numpy(weight, dtype=np.float32, name='weight') + if not _is_pyarrow_array(weight): + weight = _list_to_1d_numpy(weight, dtype=np.float32, name='weight') self.set_field('weight', weight) self.weight = self.get_field('weight') # original values can be modified at cpp side return self @@ -2710,7 +2932,7 @@ def set_init_score( Parameters ---------- - init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), or None + init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), pyarrow Array, pyarrow ChunkedArray, pyarrow Table (for multi-class task) or None Init score for Booster. Returns @@ -2732,7 +2954,7 @@ def set_group( Parameters ---------- - group : list, numpy 1-D array, pandas Series or None + group : list, numpy 1-D array, pandas Series, pyarrow Array, pyarrow ChunkedArray or None Group/query data. Only used in the learning-to-rank task. sum(group) = n_samples. @@ -2746,8 +2968,13 @@ def set_group( """ self.group = group if self._handle is not None and group is not None: - group = _list_to_1d_numpy(group, dtype=np.int32, name='group') + if not _is_pyarrow_array(group): + group = _list_to_1d_numpy(group, dtype=np.int32, name='group') self.set_field('group', group) + # original values can be modified at cpp side + constructed_group = self.get_field('group') + if constructed_group is not None: + self.group = np.diff(constructed_group) return self def set_position( @@ -2787,7 +3014,7 @@ def get_feature_name(self) -> List[str]: reserved_string_buffer_size = 255 required_string_buffer_size = ctypes.c_size_t(0) string_buffers = [ctypes.create_string_buffer(reserved_string_buffer_size) for _ in range(num_feature)] - ptr_string_buffers = (ctypes.c_char_p * num_feature)(*map(ctypes.addressof, string_buffers)) + ptr_string_buffers = (ctypes.c_char_p * num_feature)(*map(ctypes.addressof, string_buffers)) # type: ignore[misc] _safe_call(_LIB.LGBM_DatasetGetFeatureNames( self._handle, ctypes.c_int(num_feature), @@ -2801,7 +3028,7 @@ def get_feature_name(self) -> List[str]: # if buffer length is not long enough, reallocate buffers if reserved_string_buffer_size < actual_string_buffer_size: string_buffers = [ctypes.create_string_buffer(actual_string_buffer_size) for _ in range(num_feature)] - ptr_string_buffers = (ctypes.c_char_p * num_feature)(*map(ctypes.addressof, string_buffers)) + ptr_string_buffers = (ctypes.c_char_p * num_feature)(*map(ctypes.addressof, string_buffers)) # type: ignore[misc] _safe_call(_LIB.LGBM_DatasetGetFeatureNames( self._handle, ctypes.c_int(num_feature), @@ -2811,37 +3038,40 @@ def get_feature_name(self) -> List[str]: ptr_string_buffers)) return [string_buffers[i].value.decode('utf-8') for i in range(num_feature)] - def get_label(self) -> Optional[np.ndarray]: + def get_label(self) -> Optional[_LGBM_LabelType]: """Get the label of the Dataset. Returns ------- - label : numpy array or None + label : list, numpy 1-D array, pandas Series / one-column DataFrame or None The label information from the Dataset. 
+ For a constructed ``Dataset``, this will only return a numpy array. """ if self.label is None: self.label = self.get_field('label') return self.label - def get_weight(self) -> Optional[np.ndarray]: + def get_weight(self) -> Optional[_LGBM_WeightType]: """Get the weight of the Dataset. Returns ------- - weight : numpy array or None + weight : list, numpy 1-D array, pandas Series or None Weight for each data point from the Dataset. Weights should be non-negative. + For a constructed ``Dataset``, this will only return ``None`` or a numpy array. """ if self.weight is None: self.weight = self.get_field('weight') return self.weight - def get_init_score(self) -> Optional[np.ndarray]: + def get_init_score(self) -> Optional[_LGBM_InitScoreType]: """Get the initial score of the Dataset. Returns ------- - init_score : numpy array or None + init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), or None Init score of Booster. + For a constructed ``Dataset``, this will only return ``None`` or a numpy array. """ if self.init_score is None: self.init_score = self.get_field('init_score') @@ -2868,7 +3098,7 @@ def get_data(self) -> Optional[_LGBM_TrainDataType]: self.data = self.data[self.used_indices, :] elif isinstance(self.data, Sequence): self.data = self.data[self.used_indices] - elif isinstance(self.data, list) and len(self.data) > 0 and all(isinstance(x, Sequence) for x in self.data): + elif _is_list_of_sequences(self.data) and len(self.data) > 0: self.data = np.array(list(self._yield_row_from_seqlist(self.data, self.used_indices))) else: _log_warning(f"Cannot subset {type(self.data).__name__} type of raw data.\n" @@ -2879,17 +3109,18 @@ def get_data(self) -> Optional[_LGBM_TrainDataType]: "set free_raw_data=False when construct Dataset to avoid this.") return self.data - def get_group(self) -> Optional[np.ndarray]: + def get_group(self) -> Optional[_LGBM_GroupType]: """Get the group of the Dataset. Returns ------- - group : numpy array or None + group : list, numpy 1-D array, pandas Series or None Group/query data. Only used in the learning-to-rank task. sum(group) = n_samples. For example, if you have a 100-document dataset with ``group = [10, 20, 40, 10, 10, 10]``, that means that you have 6 groups, where the first 10 records are in the first group, records 11-30 are in the second group, records 31-70 are in the third group, etc. + For a constructed ``Dataset``, this will only return ``None`` or a numpy array. """ if self.group is None: self.group = self.get_field('group') @@ -2898,13 +3129,14 @@ def get_group(self) -> Optional[np.ndarray]: self.group = np.diff(self.group) return self.group - def get_position(self) -> Optional[np.ndarray]: + def get_position(self) -> Optional[_LGBM_PositionType]: """Get the position of the Dataset. Returns ------- - position : numpy 1-D array or None + position : numpy 1-D array, pandas Series or None Position of items used in unbiased learning-to-rank task. + For a constructed ``Dataset``, this will only return ``None`` or a numpy array. 
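These getters share one contract: before ``construct()`` they return whatever raw object was supplied, afterwards they re-read a numpy array from the C++ side. A minimal sketch:

    import lightgbm as lgb
    import numpy as np

    ds = lgb.Dataset(np.random.rand(100, 2), label=list(range(100)))
    print(type(ds.get_label()))              # <class 'list'>: raw value, not yet constructed
    print(type(ds.construct().get_label()))  # <class 'numpy.ndarray'>: read back from C++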
""" if self.position is None: self.position = self.get_field('position') @@ -3292,7 +3524,7 @@ def _get_loaded_param(self) -> Dict[str, Any]: buffer_len = 1 << 20 tmp_out_len = ctypes.c_int64(0) string_buffer = ctypes.create_string_buffer(buffer_len) - ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) + ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer)) _safe_call(_LIB.LGBM_BoosterGetLoadedParam( self._handle, ctypes.c_int64(buffer_len), @@ -3302,7 +3534,7 @@ def _get_loaded_param(self) -> Dict[str, Any]: # if buffer length is not long enough, re-allocate a buffer if actual_len > buffer_len: string_buffer = ctypes.create_string_buffer(actual_len) - ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) + ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer)) _safe_call(_LIB.LGBM_BoosterGetLoadedParam( self._handle, ctypes.c_int64(actual_len), @@ -4055,7 +4287,7 @@ def model_to_string( buffer_len = 1 << 20 tmp_out_len = ctypes.c_int64(0) string_buffer = ctypes.create_string_buffer(buffer_len) - ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) + ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer)) _safe_call(_LIB.LGBM_BoosterSaveModelToString( self._handle, ctypes.c_int(start_iteration), @@ -4068,7 +4300,7 @@ def model_to_string( # if buffer length is not long enough, re-allocate a buffer if actual_len > buffer_len: string_buffer = ctypes.create_string_buffer(actual_len) - ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) + ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer)) _safe_call(_LIB.LGBM_BoosterSaveModelToString( self._handle, ctypes.c_int(start_iteration), @@ -4123,7 +4355,7 @@ def dump_model( buffer_len = 1 << 20 tmp_out_len = ctypes.c_int64(0) string_buffer = ctypes.create_string_buffer(buffer_len) - ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) + ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer)) _safe_call(_LIB.LGBM_BoosterDumpModel( self._handle, ctypes.c_int(start_iteration), @@ -4136,7 +4368,7 @@ def dump_model( # if buffer length is not long enough, reallocate a buffer if actual_len > buffer_len: string_buffer = ctypes.create_string_buffer(actual_len) - ptr_string_buffer = ctypes.c_char_p(*[ctypes.addressof(string_buffer)]) + ptr_string_buffer = ctypes.c_char_p(ctypes.addressof(string_buffer)) _safe_call(_LIB.LGBM_BoosterDumpModel( self._handle, ctypes.c_int(start_iteration), @@ -4166,7 +4398,7 @@ def predict( Parameters ---------- - data : str, pathlib.Path, numpy array, pandas DataFrame, H2O DataTable's Frame or scipy.sparse + data : str, pathlib.Path, numpy array, pandas DataFrame, pyarrow Table, H2O DataTable's Frame or scipy.sparse Data source for prediction. If str or pathlib.Path, it represents the path to a text file (CSV, TSV, or LibSVM). start_iteration : int, optional (default=0) @@ -4250,7 +4482,7 @@ def refit( data : str, pathlib.Path, numpy array, pandas DataFrame, H2O DataTable's Frame, scipy.sparse, Sequence, list of Sequence or list of numpy array Data source for refit. If str or pathlib.Path, it represents the path to a text file (CSV, TSV, or LibSVM). - label : list, numpy 1-D array or pandas Series / one-column DataFrame + label : list, numpy 1-D array, pandas Series / one-column DataFrame, pyarrow Array or pyarrow ChunkedArray Label for refit. decay_rate : float, optional (default=0.9) Decay rate of refit, @@ -4260,12 +4492,12 @@ def refit( .. 
versionadded:: 4.0.0 - weight : list, numpy 1-D array, pandas Series or None, optional (default=None) + weight : list, numpy 1-D array, pandas Series, pyarrow Array, pyarrow ChunkedArray or None, optional (default=None) Weight for each ``data`` instance. Weights should be non-negative. .. versionadded:: 4.0.0 - group : list, numpy 1-D array, pandas Series or None, optional (default=None) + group : list, numpy 1-D array, pandas Series, pyarrow Array, pyarrow ChunkedArray or None, optional (default=None) Group/query size for ``data``. Only used in the learning-to-rank task. sum(group) = n_samples. @@ -4274,7 +4506,7 @@ def refit( .. versionadded:: 4.0.0 - init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), or None, optional (default=None) + init_score : list, list of lists (for multi-class task), numpy array, pandas Series, pandas DataFrame (for multi-class task), pyarrow Array, pyarrow ChunkedArray, pyarrow Table (for multi-class task) or None, optional (default=None) Init score for ``data``. .. versionadded:: 4.0.0 @@ -4462,7 +4694,7 @@ def feature_name(self) -> List[str]: reserved_string_buffer_size = 255 required_string_buffer_size = ctypes.c_size_t(0) string_buffers = [ctypes.create_string_buffer(reserved_string_buffer_size) for _ in range(num_feature)] - ptr_string_buffers = (ctypes.c_char_p * num_feature)(*map(ctypes.addressof, string_buffers)) + ptr_string_buffers = (ctypes.c_char_p * num_feature)(*map(ctypes.addressof, string_buffers)) # type: ignore[misc] _safe_call(_LIB.LGBM_BoosterGetFeatureNames( self._handle, ctypes.c_int(num_feature), @@ -4476,7 +4708,7 @@ def feature_name(self) -> List[str]: # if buffer length is not long enough, reallocate buffers if reserved_string_buffer_size < actual_string_buffer_size: string_buffers = [ctypes.create_string_buffer(actual_string_buffer_size) for _ in range(num_feature)] - ptr_string_buffers = (ctypes.c_char_p * num_feature)(*map(ctypes.addressof, string_buffers)) + ptr_string_buffers = (ctypes.c_char_p * num_feature)(*map(ctypes.addressof, string_buffers)) # type: ignore[misc] _safe_call(_LIB.LGBM_BoosterGetFeatureNames( self._handle, ctypes.c_int(num_feature), @@ -4686,7 +4918,7 @@ def __get_eval_info(self) -> None: string_buffers = [ ctypes.create_string_buffer(reserved_string_buffer_size) for _ in range(self.__num_inner_eval) ] - ptr_string_buffers = (ctypes.c_char_p * self.__num_inner_eval)(*map(ctypes.addressof, string_buffers)) + ptr_string_buffers = (ctypes.c_char_p * self.__num_inner_eval)(*map(ctypes.addressof, string_buffers)) # type: ignore[misc] _safe_call(_LIB.LGBM_BoosterGetEvalNames( self._handle, ctypes.c_int(self.__num_inner_eval), @@ -4702,7 +4934,7 @@ def __get_eval_info(self) -> None: string_buffers = [ ctypes.create_string_buffer(actual_string_buffer_size) for _ in range(self.__num_inner_eval) ] - ptr_string_buffers = (ctypes.c_char_p * self.__num_inner_eval)(*map(ctypes.addressof, string_buffers)) + ptr_string_buffers = (ctypes.c_char_p * self.__num_inner_eval)(*map(ctypes.addressof, string_buffers)) # type: ignore[misc] _safe_call(_LIB.LGBM_BoosterGetEvalNames( self._handle, ctypes.c_int(self.__num_inner_eval), diff --git a/python-package/lightgbm/callback.py b/python-package/lightgbm/callback.py index 2f77ee740c75..b68bb63c7f41 100644 --- a/python-package/lightgbm/callback.py +++ b/python-package/lightgbm/callback.py @@ -229,7 +229,12 @@ def __call__(self, env: CallbackEnv) -> None: if new_param != env.params.get(key, None): 
new_parameters[key] = new_param if new_parameters: - env.model.reset_parameter(new_parameters) + if isinstance(env.model, Booster): + env.model.reset_parameter(new_parameters) + else: + # CVBooster holds a list of Booster objects, each needs to be updated + for booster in env.model.boosters: + booster.reset_parameter(new_parameters) env.params.update(new_parameters) @@ -267,6 +272,10 @@ def __init__( verbose: bool = True, min_delta: Union[float, List[float]] = 0.0 ) -> None: + + if not isinstance(stopping_rounds, int) or stopping_rounds <= 0: + raise ValueError(f"stopping_rounds should be an integer and greater than 0. got: {stopping_rounds}") + self.order = 30 self.before_iteration = False @@ -291,32 +300,45 @@ def _gt_delta(self, curr_score: float, best_score: float, delta: float) -> bool: def _lt_delta(self, curr_score: float, best_score: float, delta: float) -> bool: return curr_score < best_score - delta - def _is_train_set(self, ds_name: str, eval_name: str, train_name: str) -> bool: - return (ds_name == "cv_agg" and eval_name == "train") or ds_name == train_name + def _is_train_set(self, ds_name: str, eval_name: str, env: CallbackEnv) -> bool: + """Check, by name, if a given Dataset is the training data.""" + # for lgb.cv() with eval_train_metric=True, evaluation is also done on the training set + # and those metrics are considered for early stopping + if ds_name == "cv_agg" and eval_name == "train": + return True + + # for lgb.train(), it's possible to pass the training data via valid_sets with any eval_name + if isinstance(env.model, Booster) and ds_name == env.model._train_data_name: + return True + + return False def _init(self, env: CallbackEnv) -> None: if env.evaluation_result_list is None or env.evaluation_result_list == []: raise ValueError( "For early stopping, at least one dataset and eval metric is required for evaluation" ) + is_dart = any(env.params.get(alias, "") == 'dart' for alias in _ConfigAliases.get("boosting")) - only_train_set = ( - len(env.evaluation_result_list) == 1 - and self._is_train_set( - ds_name=env.evaluation_result_list[0][0], - eval_name=env.evaluation_result_list[0][1].split(" ")[0], - train_name=env.model._train_data_name) - ) - self.enabled = not is_dart and not only_train_set - if not self.enabled: - if is_dart: - _log_warning('Early stopping is not available in dart mode') - elif only_train_set: - _log_warning('Only training set found, disabling early stopping.') + if is_dart: + self.enabled = False + _log_warning('Early stopping is not available in dart mode') return - if self.stopping_rounds <= 0: - raise ValueError("stopping_rounds should be greater than zero.") + # validation sets are guaranteed to not be identical to the training data in cv() + if isinstance(env.model, Booster): + only_train_set = ( + len(env.evaluation_result_list) == 1 + and self._is_train_set( + ds_name=env.evaluation_result_list[0][0], + eval_name=env.evaluation_result_list[0][1].split(" ")[0], + env=env + ) + ) + if only_train_set: + self.enabled = False + _log_warning('Only training set found, disabling early stopping.') + return if self.verbose: _log_info(f"Training until validation scores don't improve for {self.stopping_rounds} rounds") @@ -395,7 +417,11 @@ def __call__(self, env: CallbackEnv) -> None: eval_name_splitted = env.evaluation_result_list[i][1].split(" ") if self.first_metric_only and self.first_metric != eval_name_splitted[-1]: continue # use only the first metric for early stopping - if self._is_train_set(env.evaluation_result_list[i][0], 
eval_name_splitted[0], env.model._train_data_name): + if self._is_train_set( + ds_name=env.evaluation_result_list[i][0], + eval_name=eval_name_splitted[0], + env=env + ): continue # train data for lgb.cv or sklearn wrapper (underlying lgb.train) elif env.iteration - self.best_iter[i] >= self.stopping_rounds: if self.verbose: diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py index 0a55ccd1e421..bd1b29a1e802 100644 --- a/python-package/lightgbm/compat.py +++ b/python-package/lightgbm/compat.py @@ -36,6 +36,16 @@ def __init__(self, *args, **kwargs): concat = None +"""numpy""" +try: + from numpy.random import Generator as np_random_Generator +except ImportError: + class np_random_Generator: # type: ignore + """Dummy class for np.random.Generator.""" + + def __init__(self, *args, **kwargs): + pass + """matplotlib""" try: import matplotlib # noqa: F401 @@ -185,6 +195,59 @@ class dask_Series: # type: ignore def __init__(self, *args, **kwargs): pass +"""pyarrow""" +try: + import pyarrow.compute as pa_compute + from pyarrow import Array as pa_Array + from pyarrow import ChunkedArray as pa_ChunkedArray + from pyarrow import Table as pa_Table + from pyarrow import chunked_array as pa_chunked_array + from pyarrow.cffi import ffi as arrow_cffi + from pyarrow.types import is_floating as arrow_is_floating + from pyarrow.types import is_integer as arrow_is_integer + PYARROW_INSTALLED = True +except ImportError: + PYARROW_INSTALLED = False + + class pa_Array: # type: ignore + """Dummy class for pa.Array.""" + + def __init__(self, *args, **kwargs): + pass + + class pa_ChunkedArray: # type: ignore + """Dummy class for pa.ChunkedArray.""" + + def __init__(self, *args, **kwargs): + pass + + class pa_Table: # type: ignore + """Dummy class for pa.Table.""" + + def __init__(self, *args, **kwargs): + pass + + class arrow_cffi: # type: ignore + """Dummy class for pyarrow.cffi.ffi.""" + + CData = None + addressof = None + cast = None + new = None + + def __init__(self, *args, **kwargs): + pass + + class pa_compute: # type: ignore + """Dummy class for pyarrow.compute.""" + + all = None + equal = None + + pa_chunked_array = None + arrow_is_integer = None + arrow_is_floating = None + """cpu_count()""" try: from joblib import cpu_count diff --git a/python-package/lightgbm/dask.py b/python-package/lightgbm/dask.py index ce5f7137b20c..094d49223952 100644 --- a/python-package/lightgbm/dask.py +++ b/python-package/lightgbm/dask.py @@ -1142,7 +1142,7 @@ def __init__( colsample_bytree: float = 1., reg_alpha: float = 0., reg_lambda: float = 0., - random_state: Optional[Union[int, np.random.RandomState]] = None, + random_state: Optional[Union[int, np.random.RandomState, 'np.random.Generator']] = None, n_jobs: Optional[int] = None, importance_type: str = 'split', validation_fraction: Optional[float] = 0.1, @@ -1348,7 +1348,7 @@ def __init__( colsample_bytree: float = 1., reg_alpha: float = 0., reg_lambda: float = 0., - random_state: Optional[Union[int, np.random.RandomState]] = None, + random_state: Optional[Union[int, np.random.RandomState, 'np.random.Generator']] = None, n_jobs: Optional[int] = None, importance_type: str = 'split', validation_fraction: Optional[float] = 0.1, @@ -1519,7 +1519,7 @@ def __init__( colsample_bytree: float = 1., reg_alpha: float = 0., reg_lambda: float = 0., - random_state: Optional[Union[int, np.random.RandomState]] = None, + random_state: Optional[Union[int, np.random.RandomState, 'np.random.Generator']] = None, n_jobs: Optional[int] = None, importance_type: str = 
'split', validation_fraction: Optional[float] = 0.1, diff --git a/python-package/lightgbm/libpath.py b/python-package/lightgbm/libpath.py index c096a6f1b5e2..21222228b0c2 100644 --- a/python-package/lightgbm/libpath.py +++ b/python-package/lightgbm/libpath.py @@ -16,8 +16,7 @@ def find_lib_path() -> List[str]: List of all found library paths to LightGBM. """ curr_path = Path(__file__).absolute() - dll_path = [curr_path, - curr_path.parents[1], + dll_path = [curr_path.parents[1], curr_path.parents[0] / 'bin', curr_path.parents[0] / 'lib'] if system() in ('Windows', 'Microsoft'): diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py index f1a132637aa1..64be9e55e8af 100644 --- a/python-package/lightgbm/sklearn.py +++ b/python-package/lightgbm/sklearn.py @@ -15,8 +15,8 @@ from .compat import (SKLEARN_INSTALLED, LGBMNotFittedError, _LGBMAssertAllFinite, _LGBMCheckArray, _LGBMCheckClassificationTargets, _LGBMCheckSampleWeight, _LGBMCheckXY, _LGBMClassifierBase, _LGBMComputeSampleWeight, _LGBMCpuCount, _LGBMLabelEncoder, _LGBMModelBase, _LGBMRegressorBase, - dt_DataTable, pd_DataFrame) -from .engine import _make_n_folds, train + dt_DataTable, np_random_Generator, pd_DataFrame) +from .engine import train __all__ = [ 'LGBMClassifier', @@ -86,6 +86,36 @@ _LGBM_ScikitValidSet = Tuple[_LGBM_ScikitMatrixLike, _LGBM_LabelType] +def _get_group_from_constructed_dataset(dataset: Dataset) -> Optional[np.ndarray]: + group = dataset.get_group() + error_msg = ( + "Estimators in lightgbm.sklearn should only retrieve query groups from a constructed Dataset. " + "If you're seeing this message, it's a bug in lightgbm. Please report it at https://github.com/microsoft/LightGBM/issues." + ) + assert (group is None or isinstance(group, np.ndarray)), error_msg + return group + + +def _get_label_from_constructed_dataset(dataset: Dataset) -> np.ndarray: + label = dataset.get_label() + error_msg = ( + "Estimators in lightgbm.sklearn should only retrieve labels from a constructed Dataset. " + "If you're seeing this message, it's a bug in lightgbm. Please report it at https://github.com/microsoft/LightGBM/issues." + ) + assert isinstance(label, np.ndarray), error_msg + return label + + +def _get_weight_from_constructed_dataset(dataset: Dataset) -> Optional[np.ndarray]: + weight = dataset.get_weight() + error_msg = ( + "Estimators in lightgbm.sklearn should only retrieve weights from a constructed Dataset. " + "If you're seeing this message, it's a bug in lightgbm. Please report it at https://github.com/microsoft/LightGBM/issues." + ) + assert (weight is None or isinstance(weight, np.ndarray)), error_msg + return weight + + class _ObjectiveFunctionWrapper: """Proxy class for objective function.""" @@ -151,17 +181,22 @@ def __call__(self, preds: np.ndarray, dataset: Dataset) -> Tuple[np.ndarray, np. The value of the second order derivative (Hessian) of the loss with respect to the elements of preds for each sample point. 
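The dispatch below feeds a user-supplied objective 2, 3, or 4 arguments depending on its signature. For instance, a 2-argument squared-error objective might look like this (a sketch, not part of this diff):

    import numpy as np

    def l2_objective(labels: np.ndarray, preds: np.ndarray):
        grad = preds - labels       # first derivative of 0.5 * (preds - labels) ** 2
        hess = np.ones_like(preds)  # second derivative
        return grad, hess

    # e.g. lgb.LGBMRegressor(objective=l2_objective); a 3- or 4-argument version
    # additionally receives the dataset's weights and, for ranking, its query groups.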
""" - labels = dataset.get_label() + labels = _get_label_from_constructed_dataset(dataset) argc = len(signature(self.func).parameters) if argc == 2: grad, hess = self.func(labels, preds) # type: ignore[call-arg] - elif argc == 3: - grad, hess = self.func(labels, preds, dataset.get_weight()) # type: ignore[call-arg] - elif argc == 4: - grad, hess = self.func(labels, preds, dataset.get_weight(), dataset.get_group()) # type: ignore [call-arg] - else: - raise TypeError(f"Self-defined objective function should have 2, 3 or 4 arguments, got {argc}") - return grad, hess + return grad, hess + + weight = _get_weight_from_constructed_dataset(dataset) + if argc == 3: + grad, hess = self.func(labels, preds, weight) # type: ignore[call-arg] + return grad, hess + + if argc == 4: + group = _get_group_from_constructed_dataset(dataset) + return self.func(labels, preds, weight, group) # type: ignore[call-arg] + + raise TypeError(f"Self-defined objective function should have 2, 3 or 4 arguments, got {argc}") class _EvalFunctionWrapper: @@ -229,16 +264,20 @@ def __call__( is_higher_better : bool Is eval result higher better, e.g. AUC is ``is_higher_better``. """ - labels = dataset.get_label() + labels = _get_label_from_constructed_dataset(dataset) argc = len(signature(self.func).parameters) if argc == 2: return self.func(labels, preds) # type: ignore[call-arg] - elif argc == 3: - return self.func(labels, preds, dataset.get_weight()) # type: ignore[call-arg] - elif argc == 4: - return self.func(labels, preds, dataset.get_weight(), dataset.get_group()) # type: ignore[call-arg] - else: - raise TypeError(f"Self-defined eval function should have 2, 3 or 4 arguments, got {argc}") + + weight = _get_weight_from_constructed_dataset(dataset) + if argc == 3: + return self.func(labels, preds, weight) # type: ignore[call-arg] + + if argc == 4: + group = _get_group_from_constructed_dataset(dataset) + return self.func(labels, preds, weight, group) # type: ignore[call-arg] + + raise TypeError(f"Self-defined eval function should have 2, 3 or 4 arguments, got {argc}") # documentation templates for LGBMModel methods are shared between the classes in @@ -409,7 +448,7 @@ def __init__( colsample_bytree: float = 1., reg_alpha: float = 0., reg_lambda: float = 0., - random_state: Optional[Union[int, np.random.RandomState]] = None, + random_state: Optional[Union[int, np.random.RandomState, 'np.random.Generator']] = None, n_jobs: Optional[int] = None, importance_type: str = 'split', validation_fraction: Optional[float] = 0.1, @@ -471,7 +510,7 @@ def __init__( random_state : int, RandomState object or None, optional (default=None) Random number seed. If int, this number is used to seed the C++ code. - If RandomState object (numpy), a random integer is picked based on its state to seed the C++ code. + If RandomState or Generator object (numpy), a random integer is picked based on its state to seed the C++ code. If None, default seeds in C++ code are used. 
n_jobs : int or None, optional (default=None) Number of parallel threads to use for training (can be changed at prediction time by @@ -678,6 +717,10 @@ def _process_params(self, stage: str) -> Dict[str, Any]: if isinstance(params['random_state'], np.random.RandomState): params['random_state'] = params['random_state'].randint(np.iinfo(np.int32).max) + elif isinstance(params['random_state'], np_random_Generator): + params['random_state'] = int( + params['random_state'].integers(np.iinfo(np.int32).max) + ) params = _choose_param_value( main_param_name="early_stopping_round", diff --git a/python-package/pyproject.toml b/python-package/pyproject.toml index 6e43dc242d1b..d5586aa3857d 100644 --- a/python-package/pyproject.toml +++ b/python-package/pyproject.toml @@ -30,9 +30,13 @@ maintainers = [ name = "lightgbm" readme = "README.rst" requires-python = ">=3.6" -version = "4.1.0.99" +version = "4.2.0.99" [project.optional-dependencies] +arrow = [ + "cffi>=1.15.1", + "pyarrow>=6.0.1" +] dask = [ "dask[array,dataframe,distributed]>=2.0.0", "pandas>=0.24.0" @@ -59,7 +63,7 @@ build-backend = "scikit_build_core.build" # based on https://github.com/scikit-build/scikit-build-core#configuration [tool.scikit-build] -cmake.minimum-version = "3.15" +cmake.minimum-version = "3.18" ninja.minimum-version = "1.11" ninja.make-fallback = true cmake.args = [ diff --git a/src/application/application.cpp b/src/application/application.cpp index 3e51136afc96..0bb9eca13bf2 100644 --- a/src/application/application.cpp +++ b/src/application/application.cpp @@ -227,7 +227,7 @@ void Application::Predict() { TextReader result_reader(config_.output_result.c_str(), false); result_reader.ReadAllLines(); std::vector> pred_leaf(result_reader.Lines().size()); - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (int i = 0; i < static_cast(result_reader.Lines().size()); ++i) { pred_leaf[i] = Common::StringToArray(result_reader.Lines()[i], '\t'); // Free memory diff --git a/src/application/predictor.hpp b/src/application/predictor.hpp index d1a8aca4d041..18eb01e30179 100644 --- a/src/application/predictor.hpp +++ b/src/application/predictor.hpp @@ -233,7 +233,7 @@ class Predictor { std::vector> oneline_features; std::vector result_to_write(lines.size()); OMP_INIT_EX(); - #pragma omp parallel for schedule(static) firstprivate(oneline_features) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) firstprivate(oneline_features) for (data_size_t i = 0; i < static_cast(lines.size()); ++i) { OMP_LOOP_EX_BEGIN(); oneline_features.clear(); diff --git a/src/boosting/cuda/cuda_score_updater.hpp b/src/boosting/cuda/cuda_score_updater.hpp index ec728777e66c..cb79b43b9f36 100644 --- a/src/boosting/cuda/cuda_score_updater.hpp +++ b/src/boosting/cuda/cuda_score_updater.hpp @@ -8,7 +8,7 @@ #ifdef USE_CUDA -#include +#include #include "../score_updater.hpp" diff --git a/src/boosting/gbdt.cpp b/src/boosting/gbdt.cpp index 1f0a5405bf49..b75adab6d971 100644 --- a/src/boosting/gbdt.cpp +++ b/src/boosting/gbdt.cpp @@ -255,7 +255,7 @@ void GBDT::RefitTree(const std::vector>& tree_leaf_prediction) std::vector leaf_pred(num_data_); if (linear_tree_) { std::vector max_leaves_by_thread = std::vector(OMP_NUM_THREADS(), 0); - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (int i = 0; i < static_cast(tree_leaf_prediction.size()); ++i) { int tid = omp_get_thread_num(); for (size_t j = 0; j < 
tree_leaf_prediction[i].size(); ++j) { @@ -270,7 +270,7 @@ void GBDT::RefitTree(const std::vector>& tree_leaf_prediction) Boosting(); for (int tree_id = 0; tree_id < num_tree_per_iteration_; ++tree_id) { int model_index = iter * num_tree_per_iteration_ + tree_id; - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (int i = 0; i < num_data_; ++i) { leaf_pred[i] = tree_leaf_prediction[i][model_index]; CHECK_LT(leaf_pred[i], models_[model_index]->num_leaves()); @@ -348,7 +348,7 @@ bool GBDT::TrainOneIter(const score_t* gradients, const score_t* hessians) { if (data_sample_strategy_->IsHessianChange()) { // need to copy customized gradients when using GOSS int64_t total_size = static_cast(num_data_) * num_tree_per_iteration_; - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (int64_t i = 0; i < total_size; ++i) { gradients_[i] = gradients[i]; hessians_[i] = hessians[i]; @@ -669,7 +669,7 @@ void GBDT::GetPredictAt(int data_idx, double* out_result, int64_t* out_len) { } #endif // USE_CUDA if (objective_function_ != nullptr) { - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (data_size_t i = 0; i < num_data; ++i) { std::vector tree_pred(num_tree_per_iteration_); for (int j = 0; j < num_tree_per_iteration_; ++j) { @@ -682,7 +682,7 @@ void GBDT::GetPredictAt(int data_idx, double* out_result, int64_t* out_len) { } } } else { - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (data_size_t i = 0; i < num_data; ++i) { for (int j = 0; j < num_tree_per_iteration_; ++j) { out_result[j * num_data + i] = static_cast(raw_scores[j * num_data + i]); diff --git a/src/boosting/gbdt.h b/src/boosting/gbdt.h index e38b26be3e14..28ebee446fad 100644 --- a/src/boosting/gbdt.h +++ b/src/boosting/gbdt.h @@ -179,15 +179,20 @@ class GBDT : public GBDTBase { const auto pair = Common::Split(line.c_str(), ":"); if (pair[1] == " ]") continue; + const auto param = pair[0].substr(1); + const auto value_str = pair[1].substr(1, pair[1].size() - 2); + auto iter = param_types.find(param); + if (iter == param_types.end()) { + Log::Warning("Ignoring unrecognized parameter '%s' found in model string.", param.c_str()); + continue; + } + std::string param_type = iter->second; if (first) { first = false; str_buf << "\""; } else { str_buf << ",\""; } - const auto param = pair[0].substr(1); - const auto value_str = pair[1].substr(1, pair[1].size() - 2); - const auto param_type = param_types.at(param); str_buf << param << "\": "; if (param_type == "string") { str_buf << "\"" << value_str << "\""; @@ -429,7 +434,7 @@ class GBDT : public GBDTBase { } start_iteration_for_pred_ = start_iteration; if (is_pred_contrib) { - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (int i = 0; i < static_cast(models_.size()); ++i) { models_[i]->RecomputeMaxDepth(); } diff --git a/src/boosting/gbdt_model_text.cpp b/src/boosting/gbdt_model_text.cpp index 73c3ea98d3f6..27be5afe066e 100644 --- a/src/boosting/gbdt_model_text.cpp +++ b/src/boosting/gbdt_model_text.cpp @@ -354,7 +354,7 @@ std::string GBDT::SaveModelToString(int start_iteration, int num_iteration, int std::vector tree_strs(num_used_model - start_model); std::vector tree_sizes(num_used_model - start_model); // output tree models - #pragma omp 
parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (int i = start_model; i < num_used_model; ++i) { const int idx = i - start_model; tree_strs[idx] = "Tree=" + std::to_string(idx) + '\n'; @@ -552,7 +552,7 @@ bool GBDT::LoadModelFromString(const char* buffer, size_t len) { models_.emplace_back(nullptr); } OMP_INIT_EX(); - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (int i = 0; i < num_trees; ++i) { OMP_LOOP_EX_BEGIN(); auto cur_p = p + tree_boundries[i]; diff --git a/src/boosting/rf.hpp b/src/boosting/rf.hpp index 88ece154e432..e6101dc30a39 100644 --- a/src/boosting/rf.hpp +++ b/src/boosting/rf.hpp @@ -97,7 +97,7 @@ class RF : public GBDT { } size_t total_size = static_cast(num_data_) * num_tree_per_iteration_; std::vector tmp_scores(total_size, 0.0f); - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (int j = 0; j < num_tree_per_iteration_; ++j) { size_t offset = static_cast(j)* num_data_; for (data_size_t i = 0; i < num_data_; ++i) { diff --git a/src/boosting/score_updater.hpp b/src/boosting/score_updater.hpp index 6e475455df7a..2333b36ec029 100644 --- a/src/boosting/score_updater.hpp +++ b/src/boosting/score_updater.hpp @@ -39,7 +39,7 @@ class ScoreUpdater { Log::Fatal("Number of class for initial score error"); } has_init_score_ = true; -#pragma omp parallel for schedule(static, 512) if (total_size >= 1024) +#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (total_size >= 1024) for (int64_t i = 0; i < total_size; ++i) { score_[i] = init_score[i]; } @@ -54,7 +54,7 @@ class ScoreUpdater { virtual inline void AddScore(double val, int cur_tree_id) { Common::FunctionTimer fun_timer("ScoreUpdater::AddScore", global_timer); const size_t offset = static_cast(num_data_) * cur_tree_id; -#pragma omp parallel for schedule(static, 512) if (num_data_ >= 1024) +#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_data_ >= 1024) for (int i = 0; i < num_data_; ++i) { score_[offset + i] += val; } @@ -62,7 +62,7 @@ class ScoreUpdater { virtual inline void MultiplyScore(double val, int cur_tree_id) { const size_t offset = static_cast(num_data_) * cur_tree_id; -#pragma omp parallel for schedule(static, 512) if (num_data_ >= 1024) +#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_data_ >= 1024) for (int i = 0; i < num_data_; ++i) { score_[offset + i] *= val; } diff --git a/src/c_api.cpp b/src/c_api.cpp index 8c4eee96b4c9..67b18003588a 100644 --- a/src/c_api.cpp +++ b/src/c_api.cpp @@ -4,6 +4,7 @@ */ #include +#include #include #include #include @@ -437,7 +438,7 @@ class Booster { int64_t num_pred_in_one_row = boosting_->NumPredictOneRow(start_iteration, num_iteration, is_predict_leaf, predict_contrib); auto pred_fun = predictor.GetPredictFunction(); OMP_INIT_EX(); - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (int i = 0; i < nrow; ++i) { OMP_LOOP_EX_BEGIN(); auto one_row = get_row_fun(i); @@ -459,7 +460,7 @@ class Booster { auto pred_sparse_fun = predictor.GetPredictSparseFunction(); std::vector>>& agg = *agg_ptr; OMP_INIT_EX(); - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (int64_t i = 0; i < nrow; ++i) { OMP_LOOP_EX_BEGIN(); auto one_row = 
get_row_fun(i); @@ -551,7 +552,7 @@ class Booster { indptr_index++; int64_t matrix_start_index = m * static_cast(agg.size()); OMP_INIT_EX(); - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (int64_t i = 0; i < static_cast(agg.size()); ++i) { OMP_LOOP_EX_BEGIN(); auto row_vector = agg[i]; @@ -663,7 +664,7 @@ class Booster { } // Note: we parallelize across matrices instead of rows because of the column_counts[m][col_idx] increment inside the loop OMP_INIT_EX(); - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (int m = 0; m < num_matrices; ++m) { OMP_LOOP_EX_BEGIN(); for (int64_t i = 0; i < static_cast(agg.size()); ++i) { @@ -832,6 +833,8 @@ class Booster { // explicitly declare symbols from LightGBM namespace using LightGBM::AllgatherFunction; +using LightGBM::ArrowChunkedArray; +using LightGBM::ArrowTable; using LightGBM::Booster; using LightGBM::Common::CheckElementsIntervalClosed; using LightGBM::Common::RemoveQuotationSymbol; @@ -1074,7 +1077,7 @@ int LGBM_DatasetPushRows(DatasetHandle dataset, p_dataset->ResizeRaw(p_dataset->num_numeric_features() + nrow); } OMP_INIT_EX(); - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (int i = 0; i < nrow; ++i) { OMP_LOOP_EX_BEGIN(); const int tid = omp_get_thread_num(); @@ -1116,7 +1119,7 @@ int LGBM_DatasetPushRowsWithMetadata(DatasetHandle dataset, const int max_omp_threads = p_dataset->omp_max_threads() > 0 ? p_dataset->omp_max_threads() : OMP_NUM_THREADS(); OMP_INIT_EX(); -#pragma omp parallel for schedule(static) +#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (int i = 0; i < nrow; ++i) { OMP_LOOP_EX_BEGIN(); // convert internal thread id to be unique based on external thread id @@ -1153,7 +1156,7 @@ int LGBM_DatasetPushRowsByCSR(DatasetHandle dataset, p_dataset->ResizeRaw(p_dataset->num_numeric_features() + nrow); } OMP_INIT_EX(); - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (int i = 0; i < nrow; ++i) { OMP_LOOP_EX_BEGIN(); const int tid = omp_get_thread_num(); @@ -1199,7 +1202,7 @@ int LGBM_DatasetPushRowsByCSRWithMetadata(DatasetHandle dataset, const int max_omp_threads = p_dataset->omp_max_threads() > 0 ? 
p_dataset->omp_max_threads() : OMP_NUM_THREADS(); OMP_INIT_EX(); -#pragma omp parallel for schedule(static) +#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (int i = 0; i < nrow; ++i) { OMP_LOOP_EX_BEGIN(); // convert internal thread id to be unique based on external thread id @@ -1319,7 +1322,7 @@ int LGBM_DatasetCreateFromMats(int32_t nmat, int32_t start_row = 0; for (int j = 0; j < nmat; ++j) { OMP_INIT_EX(); - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (int i = 0; i < nrow[j]; ++i) { OMP_LOOP_EX_BEGIN(); const int tid = omp_get_thread_num(); @@ -1394,7 +1397,7 @@ int LGBM_DatasetCreateFromCSR(const void* indptr, } } OMP_INIT_EX(); - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (int i = 0; i < nindptr - 1; ++i) { OMP_LOOP_EX_BEGIN(); const int tid = omp_get_thread_num(); @@ -1465,7 +1468,7 @@ int LGBM_DatasetCreateFromCSRFunc(void* get_row_funptr, OMP_INIT_EX(); std::vector> thread_buffer; - #pragma omp parallel for schedule(static) private(thread_buffer) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) private(thread_buffer) for (int i = 0; i < num_rows; ++i) { OMP_LOOP_EX_BEGIN(); { @@ -1506,7 +1509,7 @@ int LGBM_DatasetCreateFromCSC(const void* col_ptr, std::vector> sample_values(ncol_ptr - 1); std::vector> sample_idx(ncol_ptr - 1); OMP_INIT_EX(); - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (int i = 0; i < static_cast(sample_values.size()); ++i) { OMP_LOOP_EX_BEGIN(); CSC_RowIterator col_it(col_ptr, col_ptr_type, indices, data, data_type, ncol_ptr, nelem, i); @@ -1534,7 +1537,7 @@ int LGBM_DatasetCreateFromCSC(const void* col_ptr, reinterpret_cast(reference)); } OMP_INIT_EX(); - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (int i = 0; i < ncol_ptr - 1; ++i) { OMP_LOOP_EX_BEGIN(); const int tid = omp_get_thread_num(); @@ -1567,6 +1570,98 @@ int LGBM_DatasetCreateFromCSC(const void* col_ptr, API_END(); } +int LGBM_DatasetCreateFromArrow(int64_t n_chunks, + const ArrowArray* chunks, + const ArrowSchema* schema, + const char* parameters, + const DatasetHandle reference, + DatasetHandle *out) { + API_BEGIN(); + + auto param = Config::Str2Map(parameters); + Config config; + config.Set(param); + OMP_SET_NUM_THREADS(config.num_threads); + + std::unique_ptr ret; + + // Prepare the Arrow data + ArrowTable table(n_chunks, chunks, schema); + + // Initialize the dataset + if (reference == nullptr) { + // If there is no reference dataset, we first sample indices + auto sample_indices = CreateSampleIndices(static_cast(table.get_num_rows()), config); + auto sample_count = static_cast(sample_indices.size()); + std::vector> sample_values(table.get_num_columns()); + std::vector> sample_idx(table.get_num_columns()); + + // Then, we obtain sample values by parallelizing across columns + OMP_INIT_EX(); + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) + for (int64_t j = 0; j < table.get_num_columns(); ++j) { + OMP_LOOP_EX_BEGIN(); + + // Values need to be copied from the record batches. + sample_values[j].reserve(sample_indices.size()); + sample_idx[j].reserve(sample_indices.size()); + + // The chunks are iterated over in the inner loop as columns can be treated independently. 
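+ // NOTE: this loop assumes 'sample_indices' is sorted in ascending order: 'last_idx'
+ // tracks the previous sample row so std::advance() below always steps forward.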
+ int last_idx = 0; + int i = 0; + auto it = table.get_column(j).begin(); + for (auto idx : sample_indices) { + std::advance(it, idx - last_idx); + auto v = *it; + if (std::fabs(v) > kZeroThreshold || std::isnan(v)) { + sample_values[j].emplace_back(v); + sample_idx[j].emplace_back(i); + } + last_idx = idx; + i++; + } + OMP_LOOP_EX_END(); + } + OMP_THROW_EX(); + + // Finally, we initialize a loader from the sampled values + DatasetLoader loader(config, nullptr, 1, nullptr); + ret.reset(loader.ConstructFromSampleData(Vector2Ptr(&sample_values).data(), + Vector2Ptr(&sample_idx).data(), + table.get_num_columns(), + VectorSize(sample_values).data(), + sample_count, + table.get_num_rows(), + table.get_num_rows())); + } else { + ret.reset(new Dataset(static_cast(table.get_num_rows()))); + ret->CreateValid(reinterpret_cast(reference)); + if (ret->has_raw()) { + ret->ResizeRaw(static_cast(table.get_num_rows())); + } + } + + // After sampling and properly initializing all bins, we can add our data to the dataset. Here, + // we parallelize across columns, since each column can be pushed independently. + OMP_INIT_EX(); + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) + for (int64_t j = 0; j < table.get_num_columns(); ++j) { + OMP_LOOP_EX_BEGIN(); + const int tid = omp_get_thread_num(); + data_size_t idx = 0; + auto column = table.get_column(j); + for (auto it = column.begin(), end = column.end(); it != end; ++it) { + ret->PushOneValue(tid, idx++, j, *it); + } + OMP_LOOP_EX_END(); + } + OMP_THROW_EX(); + + ret->FinishLoad(); + *out = ret.release(); + API_END(); +} + int LGBM_DatasetGetSubset( const DatasetHandle handle, const int32_t* used_row_indices, @@ -1686,6 +1781,21 @@ int LGBM_DatasetSetField(DatasetHandle handle, API_END(); } +int LGBM_DatasetSetFieldFromArrow(DatasetHandle handle, + const char* field_name, + int64_t n_chunks, + const ArrowArray* chunks, + const ArrowSchema* schema) { + API_BEGIN(); + auto dataset = reinterpret_cast(handle); + ArrowChunkedArray ca(n_chunks, chunks, schema); + auto is_success = dataset->SetFieldFromArrow(field_name, ca); + if (!is_success) { + Log::Fatal("Input field is not supported"); + } + API_END(); +} + int LGBM_DatasetGetField(DatasetHandle handle, const char* field_name, int* out_len, @@ -2458,6 +2568,57 @@ int LGBM_BoosterPredictForMats(BoosterHandle handle, API_END(); } +int LGBM_BoosterPredictForArrow(BoosterHandle handle, + int64_t n_chunks, + const ArrowArray* chunks, + const ArrowSchema* schema, + int predict_type, + int start_iteration, + int num_iteration, + const char* parameter, + int64_t* out_len, + double* out_result) { + API_BEGIN(); + + // Apply the configuration + auto param = Config::Str2Map(parameter); + Config config; + config.Set(param); + OMP_SET_NUM_THREADS(config.num_threads); + + // Set up chunked array and iterators for all columns + ArrowTable table(n_chunks, chunks, schema); + std::vector> its; + its.reserve(table.get_num_columns()); + for (int64_t j = 0; j < table.get_num_columns(); ++j) { + its.emplace_back(table.get_column(j).begin()); + } + + // Build row function + auto num_columns = table.get_num_columns(); + auto row_fn = [num_columns, &its] (int row_idx) { + std::vector> result; + result.reserve(num_columns); + for (int64_t j = 0; j < num_columns; ++j) { + result.emplace_back(static_cast(j), its[j][row_idx]); + } + return result; + }; + + // Run prediction + Booster* ref_booster = reinterpret_cast(handle); + ref_booster->Predict(start_iteration, + num_iteration, + predict_type, + static_cast(table.get_num_rows()),
static_cast(table.get_num_columns()), + row_fn, + config, + out_result, + out_len); + API_END(); +} + int LGBM_BoosterSaveModel(BoosterHandle handle, int start_iteration, int num_iteration, @@ -2589,6 +2750,23 @@ int LGBM_NetworkInitWithFunctions(int num_machines, int rank, API_END(); } +int LGBM_SetMaxThreads(int num_threads) { + API_BEGIN(); + if (num_threads <= 0) { + LGBM_MAX_NUM_THREADS = -1; + } else { + LGBM_MAX_NUM_THREADS = num_threads; + } + API_END(); +} + +int LGBM_GetMaxThreads(int* out) { + API_BEGIN(); + *out = LGBM_MAX_NUM_THREADS; + API_END(); +} + + // ---- start of some help functions diff --git a/src/cuda/cuda_utils.cpp b/src/cuda/cuda_utils.cpp index a7d0df697e24..b601f9395268 100644 --- a/src/cuda/cuda_utils.cpp +++ b/src/cuda/cuda_utils.cpp @@ -5,7 +5,7 @@ #ifdef USE_CUDA -#include +#include namespace LightGBM { diff --git a/src/io/config.cpp b/src/io/config.cpp index e8578046960a..e25bb6d4fd70 100644 --- a/src/io/config.cpp +++ b/src/io/config.cpp @@ -389,10 +389,6 @@ void Config::CheckParamConflict() { if (deterministic) { Log::Warning("Although \"deterministic\" is set, the results ran by GPU may be non-deterministic."); } - if (use_quantized_grad) { - Log::Warning("Quantized training is not supported by CUDA tree learner. Switch to full precision training."); - use_quantized_grad = false; - } } // linear tree learner must be serial type and run on CPU device if (linear_tree) { diff --git a/src/io/dataset.cpp b/src/io/dataset.cpp index cd692afb031a..058d7bd328ad 100644 --- a/src/io/dataset.cpp +++ b/src/io/dataset.cpp @@ -536,7 +536,7 @@ MultiValBin* Dataset::GetMultiBinFromSparseFeatures(const std::vector& std::vector most_freq_bins; double sum_sparse_rate = 0; for (int i = 0; i < num_feature; ++i) { -#pragma omp parallel for schedule(static, 1) +#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 1) for (int tid = 0; tid < num_threads; ++tid) { iters[tid].emplace_back( feature_groups_[multi_group_id]->SubFeatureIterator(i)); @@ -584,7 +584,7 @@ MultiValBin* Dataset::GetMultiBinFromAllFeatures(const std::vector& of for (int fid = 0; fid < feature_groups_[gid]->num_feature_; ++fid) { const auto& bin_mapper = feature_groups_[gid]->bin_mappers_[fid]; most_freq_bins.push_back(bin_mapper->GetMostFreqBin()); -#pragma omp parallel for schedule(static, 1) +#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 1) for (int tid = 0; tid < num_threads; ++tid) { iters[tid].emplace_back( feature_groups_[gid]->SubFeatureIterator(fid)); @@ -823,7 +823,7 @@ void Dataset::ReSize(data_size_t num_data) { if (num_data_ != num_data) { num_data_ = num_data; OMP_INIT_EX(); -#pragma omp parallel for schedule(static) +#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (int group = 0; group < num_groups_; ++group) { OMP_LOOP_EX_BEGIN(); feature_groups_[group]->ReSize(num_data_); @@ -856,7 +856,7 @@ void Dataset::CopySubrow(const Dataset* fullset, int num_copy_tasks = static_cast(group_ids.size()); OMP_INIT_EX(); - #pragma omp parallel for schedule(dynamic) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(dynamic) for (int task_id = 0; task_id < num_copy_tasks; ++task_id) { OMP_LOOP_EX_BEGIN(); int group = group_ids[task_id]; @@ -875,7 +875,7 @@ void Dataset::CopySubrow(const Dataset* fullset, num_numeric_features_ = fullset->num_numeric_features_; if (has_raw_) { ResizeRaw(num_used_indices); -#pragma omp parallel for schedule(static) +#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) 
for (int i = 0; i < num_used_indices; ++i) { for (int j = 0; j < num_numeric_features_; ++j) { raw_data_[j][i] = fullset->raw_data_[j][used_indices[i]]; @@ -897,6 +897,23 @@ void Dataset::CopySubrow(const Dataset* fullset, #endif // USE_CUDA } +bool Dataset::SetFieldFromArrow(const char* field_name, const ArrowChunkedArray &ca) { + std::string name(field_name); + name = Common::Trim(name); + if (name == std::string("label") || name == std::string("target")) { + metadata_.SetLabel(ca); + } else if (name == std::string("weight") || name == std::string("weights")) { + metadata_.SetWeights(ca); + } else if (name == std::string("init_score")) { + metadata_.SetInitScore(ca); + } else if (name == std::string("query") || name == std::string("group")) { + metadata_.SetQuery(ca); + } else { + return false; + } + return true; +} + bool Dataset::SetFloatField(const char* field_name, const float* field_data, data_size_t num_element) { std::string name(field_name); @@ -1282,7 +1299,7 @@ void Dataset::ConstructHistogramsInner( int16_t* ordered_gradients_and_hessians = reinterpret_cast(ordered_gradients); const int16_t* gradients_and_hessians = reinterpret_cast(gradients); if (USE_INDICES) { - #pragma omp parallel for schedule(static, 512) if (num_data >= 1024) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_data >= 1024) for (data_size_t i = 0; i < num_data; ++i) { ordered_gradients_and_hessians[i] = gradients_and_hessians[data_indices[i]]; } @@ -1292,7 +1309,7 @@ void Dataset::ConstructHistogramsInner( } else { if (USE_INDICES) { if (USE_HESSIAN) { - #pragma omp parallel for schedule(static, 512) if (num_data >= 1024) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_data >= 1024) for (data_size_t i = 0; i < num_data; ++i) { ordered_gradients[i] = gradients[data_indices[i]]; ordered_hessians[i] = hessians[data_indices[i]]; @@ -1300,7 +1317,7 @@ void Dataset::ConstructHistogramsInner( ptr_ordered_grad = ordered_gradients; ptr_ordered_hess = ordered_hessians; } else { - #pragma omp parallel for schedule(static, 512) if (num_data >= 1024) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_data >= 1024) for (data_size_t i = 0; i < num_data; ++i) { ordered_gradients[i] = gradients[data_indices[i]]; } diff --git a/src/io/dataset_loader.cpp b/src/io/dataset_loader.cpp index 354936cfb01a..84bf3907a43c 100644 --- a/src/io/dataset_loader.cpp +++ b/src/io/dataset_loader.cpp @@ -625,7 +625,7 @@ Dataset* DatasetLoader::ConstructFromSampleData(double** sample_values, if (Network::num_machines() == 1) { // if only one machine, find bin locally OMP_INIT_EX(); - #pragma omp parallel for schedule(guided) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(guided) for (int i = 0; i < num_col; ++i) { OMP_LOOP_EX_BEGIN(); if (ignore_features_.count(i) > 0) { @@ -674,7 +674,7 @@ Dataset* DatasetLoader::ConstructFromSampleData(double** sample_values, } len[num_machines - 1] = num_total_features - start[num_machines - 1]; OMP_INIT_EX(); - #pragma omp parallel for schedule(guided) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(guided) for (int i = 0; i < len[rank]; ++i) { OMP_LOOP_EX_BEGIN(); if (ignore_features_.count(start[rank] + i) > 0) { @@ -1136,7 +1136,7 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines, if (num_machines == 1) { // if only one machine, find bin locally OMP_INIT_EX(); - #pragma omp parallel for schedule(guided) + #pragma omp 
parallel for num_threads(OMP_NUM_THREADS()) schedule(guided) for (int i = 0; i < static_cast(sample_values.size()); ++i) { OMP_LOOP_EX_BEGIN(); if (ignore_features_.count(i) > 0) { @@ -1177,7 +1177,7 @@ void DatasetLoader::ConstructBinMappersFromTextData(int rank, int num_machines, } len[num_machines - 1] = dataset->num_total_features_ - start[num_machines - 1]; OMP_INIT_EX(); - #pragma omp parallel for schedule(guided) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(guided) for (int i = 0; i < len[rank]; ++i) { OMP_LOOP_EX_BEGIN(); if (ignore_features_.count(start[rank] + i) > 0) { @@ -1268,7 +1268,7 @@ void DatasetLoader::ExtractFeaturesFromMemory(std::vector* text_dat if (!predict_fun_) { OMP_INIT_EX(); // if doesn't need to prediction with initial model - #pragma omp parallel for schedule(static) private(oneline_features) firstprivate(tmp_label, feature_row) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) private(oneline_features) firstprivate(tmp_label, feature_row) for (data_size_t i = 0; i < dataset->num_data_; ++i) { OMP_LOOP_EX_BEGIN(); const int tid = omp_get_thread_num(); @@ -1319,7 +1319,7 @@ void DatasetLoader::ExtractFeaturesFromMemory(std::vector* text_dat OMP_INIT_EX(); // if need to prediction with initial model std::vector init_score(static_cast(dataset->num_data_) * num_class_); - #pragma omp parallel for schedule(static) private(oneline_features) firstprivate(tmp_label, feature_row) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) private(oneline_features) firstprivate(tmp_label, feature_row) for (data_size_t i = 0; i < dataset->num_data_; ++i) { OMP_LOOP_EX_BEGIN(); const int tid = omp_get_thread_num(); @@ -1394,7 +1394,7 @@ void DatasetLoader::ExtractFeaturesFromFile(const char* filename, const Parser* double tmp_label = 0.0f; std::vector feature_row(dataset->num_features_); OMP_INIT_EX(); - #pragma omp parallel for schedule(static) private(oneline_features) firstprivate(tmp_label, feature_row) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) private(oneline_features) firstprivate(tmp_label, feature_row) for (data_size_t i = 0; i < static_cast(lines.size()); ++i) { OMP_LOOP_EX_BEGIN(); const int tid = omp_get_thread_num(); diff --git a/src/io/metadata.cpp b/src/io/metadata.cpp index 1fc47c46787f..55440649f55e 100644 --- a/src/io/metadata.cpp +++ b/src/io/metadata.cpp @@ -101,7 +101,7 @@ void Metadata::Init(const Metadata& fullset, const data_size_t* used_indices, da num_data_ = num_used_indices; label_ = std::vector(num_used_indices); -#pragma omp parallel for schedule(static, 512) if (num_used_indices >= 1024) +#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_used_indices >= 1024) for (data_size_t i = 0; i < num_used_indices; ++i) { label_[i] = fullset.label_[used_indices[i]]; } @@ -109,7 +109,7 @@ void Metadata::Init(const Metadata& fullset, const data_size_t* used_indices, da if (!fullset.weights_.empty()) { weights_ = std::vector(num_used_indices); num_weights_ = num_used_indices; -#pragma omp parallel for schedule(static, 512) if (num_used_indices >= 1024) +#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_used_indices >= 1024) for (data_size_t i = 0; i < num_used_indices; ++i) { weights_[i] = fullset.weights_[used_indices[i]]; } @@ -121,7 +121,7 @@ void Metadata::Init(const Metadata& fullset, const data_size_t* used_indices, da int num_class = static_cast(fullset.num_init_score_ / 
fullset.num_data_);
     init_score_ = std::vector<double>(static_cast<size_t>(num_used_indices) * num_class);
     num_init_score_ = static_cast<int64_t>(num_used_indices) * num_class;
-    #pragma omp parallel for schedule(static)
+    #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
     for (int k = 0; k < num_class; ++k) {
       const size_t offset_dest = static_cast<size_t>(k) * num_data_;
       const size_t offset_src = static_cast<size_t>(k) * fullset.num_data_;
@@ -173,7 +173,7 @@ void Metadata::PartitionLabel(const std::vector<data_size_t>& used_indices) {
   auto old_label = label_;
   num_data_ = static_cast<data_size_t>(used_indices.size());
   label_ = std::vector<label_t>(num_data_);
-#pragma omp parallel for schedule(static, 512) if (num_data_ >= 1024)
+#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_data_ >= 1024)
   for (data_size_t i = 0; i < num_data_; ++i) {
     label_[i] = old_label[used_indices[i]];
   }
@@ -255,7 +255,7 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector<data_size_t>& used_data_indices) {
       auto old_weights = weights_;
       num_weights_ = num_data_;
       weights_ = std::vector<label_t>(num_data_);
-#pragma omp parallel for schedule(static, 512)
+#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512)
       for (int i = 0; i < static_cast<int>(used_data_indices.size()); ++i) {
         weights_[i] = old_weights[used_data_indices[i]];
       }
@@ -274,7 +274,7 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector<data_size_t>& used_data_indices) {
       auto old_positions = positions_;
       num_positions_ = num_data_;
       positions_ = std::vector<data_size_t>(num_data_);
-      #pragma omp parallel for schedule(static, 512)
+      #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512)
       for (int i = 0; i < static_cast<int>(used_data_indices.size()); ++i) {
         positions_[i] = old_positions[used_data_indices[i]];
       }
@@ -335,7 +335,7 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector<data_size_t>& used_data_indices) {
       int num_class = static_cast<int>(num_init_score_ / num_all_data);
       num_init_score_ = static_cast<int64_t>(num_data_) * num_class;
       init_score_ = std::vector<double>(num_init_score_);
-#pragma omp parallel for schedule(static)
+#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
       for (int k = 0; k < num_class; ++k) {
         const size_t offset_dest = static_cast<size_t>(k) * num_data_;
         const size_t offset_src = static_cast<size_t>(k) * num_all_data;
@@ -355,32 +355,44 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector<data_size_t>& used_data_indices) {
 }
 
-void Metadata::SetInitScore(const double* init_score, data_size_t len) {
+template <typename It>
+void Metadata::SetInitScoresFromIterator(It first, It last) {
   std::lock_guard<std::mutex> lock(mutex_);
-  // save to nullptr
-  if (init_score == nullptr || len == 0) {
+  // Clear init scores on empty input
+  if (last - first == 0) {
     init_score_.clear();
     num_init_score_ = 0;
     return;
   }
-  if ((len % num_data_) != 0) {
+  if (((last - first) % num_data_) != 0) {
     Log::Fatal("Initial score size doesn't match data size");
   }
-  if (init_score_.empty()) { init_score_.resize(len); }
-  num_init_score_ = len;
+  if (init_score_.empty()) {
+    init_score_.resize(last - first);
+  }
+  num_init_score_ = last - first;
 
-  #pragma omp parallel for schedule(static, 512) if (num_init_score_ >= 1024)
+  #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_init_score_ >= 1024)
   for (int64_t i = 0; i < num_init_score_; ++i) {
-    init_score_[i] = Common::AvoidInf(init_score[i]);
+    init_score_[i] = Common::AvoidInf(first[i]);
   }
   init_score_load_from_file_ = false;
+
   #ifdef USE_CUDA
   if (cuda_metadata_ != nullptr) {
-    cuda_metadata_->SetInitScore(init_score_.data(), len);
+    cuda_metadata_->SetInitScore(init_score_.data(), init_score_.size());
   }
   #endif  // USE_CUDA
 }
 
+void Metadata::SetInitScore(const double* init_score, data_size_t len) {
+  SetInitScoresFromIterator(init_score, init_score + len);
+}
+
+void Metadata::SetInitScore(const ArrowChunkedArray& array) {
+  SetInitScoresFromIterator(array.begin(), array.end());
+}
+
 void Metadata::InsertInitScores(const double* init_scores, data_size_t start_index, data_size_t len, data_size_t source_size) {
   if (num_init_score_ <= 0) {
     Log::Fatal("Inserting initial score data into dataset with no initial scores");
@@ -403,27 +415,39 @@ void Metadata::InsertInitScores(const double* init_scores, data_size_t start_ind
   // CUDA is handled after all insertions are complete
 }
 
-void Metadata::SetLabel(const label_t* label, data_size_t len) {
+template <typename It>
+void Metadata::SetLabelsFromIterator(It first, It last) {
   std::lock_guard<std::mutex> lock(mutex_);
-  if (label == nullptr) {
-    Log::Fatal("label cannot be nullptr");
+  if (num_data_ != last - first) {
+    Log::Fatal("Length of labels differs from the length of #data");
   }
-  if (num_data_ != len) {
-    Log::Fatal("Length of label is not same with #data");
+  if (label_.empty()) {
+    label_.resize(num_data_);
   }
-  if (label_.empty()) { label_.resize(num_data_); }
 
-  #pragma omp parallel for schedule(static, 512) if (num_data_ >= 1024)
+  #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_data_ >= 1024)
   for (data_size_t i = 0; i < num_data_; ++i) {
-    label_[i] = Common::AvoidInf(label[i]);
+    label_[i] = Common::AvoidInf(first[i]);
   }
+
   #ifdef USE_CUDA
   if (cuda_metadata_ != nullptr) {
-    cuda_metadata_->SetLabel(label_.data(), len);
+    cuda_metadata_->SetLabel(label_.data(), label_.size());
   }
   #endif  // USE_CUDA
 }
 
+void Metadata::SetLabel(const label_t* label, data_size_t len) {
+  if (label == nullptr) {
+    Log::Fatal("label cannot be nullptr");
+  }
+  SetLabelsFromIterator(label, label + len);
+}
+
+void Metadata::SetLabel(const ArrowChunkedArray& array) {
+  SetLabelsFromIterator(array.begin(), array.end());
+}
+
 void Metadata::InsertLabels(const label_t* labels, data_size_t start_index, data_size_t len) {
   if (labels == nullptr) {
     Log::Fatal("label cannot be nullptr");
@@ -438,33 +462,45 @@ void Metadata::InsertLabels(const label_t* labels, data_size_t start_index, data
   // CUDA is handled after all insertions are complete
 }
 
-void Metadata::SetWeights(const label_t* weights, data_size_t len) {
+template <typename It>
+void Metadata::SetWeightsFromIterator(It first, It last) {
   std::lock_guard<std::mutex> lock(mutex_);
-  // save to nullptr
-  if (weights == nullptr || len == 0) {
+  // Clear weights on empty input
+  if (last - first == 0) {
     weights_.clear();
     num_weights_ = 0;
     return;
   }
-  if (num_data_ != len) {
-    Log::Fatal("Length of weights is not same with #data");
+  if (num_data_ != last - first) {
+    Log::Fatal("Length of weights differs from the length of #data");
+  }
+  if (weights_.empty()) {
+    weights_.resize(num_data_);
   }
-  if (weights_.empty()) { weights_.resize(num_data_); }
   num_weights_ = num_data_;
 
-  #pragma omp parallel for schedule(static, 512) if (num_weights_ >= 1024)
+  #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_weights_ >= 1024)
   for (data_size_t i = 0; i < num_weights_; ++i) {
-    weights_[i] = Common::AvoidInf(weights[i]);
+    weights_[i] = Common::AvoidInf(first[i]);
   }
   CalculateQueryWeights();
   weight_load_from_file_ = false;
+
   #ifdef USE_CUDA
   if (cuda_metadata_ != nullptr) {
-    cuda_metadata_->SetWeights(weights_.data(), len);
+    cuda_metadata_->SetWeights(weights_.data(), weights_.size());
   }
   #endif  // USE_CUDA
 }
 
+void Metadata::SetWeights(const label_t* weights, data_size_t len) {
+  SetWeightsFromIterator(weights, weights + len);
+}
+
+void Metadata::SetWeights(const ArrowChunkedArray& array) {
+  SetWeightsFromIterator(array.begin(), array.end());
+}
+
 void Metadata::InsertWeights(const label_t* weights, data_size_t start_index, data_size_t len) {
   if (!weights) {
     Log::Fatal("Passed null weights");
@@ -483,30 +519,34 @@ void Metadata::InsertWeights(const label_t* weights, data_size_t start_index, da
   // CUDA is handled after all insertions are complete
 }
 
-void Metadata::SetQuery(const data_size_t* query, data_size_t len) {
+template <typename It>
+void Metadata::SetQueriesFromIterator(It first, It last) {
   std::lock_guard<std::mutex> lock(mutex_);
-  // save to nullptr
-  if (query == nullptr || len == 0) {
+  // Clear query boundaries on empty input
+  if (last - first == 0) {
     query_boundaries_.clear();
     num_queries_ = 0;
     return;
   }
+
   data_size_t sum = 0;
-  #pragma omp parallel for schedule(static) reduction(+:sum)
-  for (data_size_t i = 0; i < len; ++i) {
-    sum += query[i];
+  #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum)
+  for (data_size_t i = 0; i < last - first; ++i) {
+    sum += first[i];
   }
   if (num_data_ != sum) {
-    Log::Fatal("Sum of query counts is not same with #data");
+    Log::Fatal("Sum of query counts (%i) differs from the length of #data (%i)", sum, num_data_);
   }
-  num_queries_ = len;
+  num_queries_ = last - first;
+
   query_boundaries_.resize(num_queries_ + 1);
   query_boundaries_[0] = 0;
   for (data_size_t i = 0; i < num_queries_; ++i) {
-    query_boundaries_[i + 1] = query_boundaries_[i] + query[i];
+    query_boundaries_[i + 1] = query_boundaries_[i] + first[i];
   }
   CalculateQueryWeights();
   query_load_from_file_ = false;
+
   #ifdef USE_CUDA
   if (cuda_metadata_ != nullptr) {
     if (query_weights_.size() > 0) {
@@ -519,6 +559,14 @@ void Metadata::SetQuery(const data_size_t* query, data_size_t len) {
 #endif  // USE_CUDA
 }
 
+void Metadata::SetQuery(const data_size_t* query, data_size_t len) {
+  SetQueriesFromIterator(query, query + len);
+}
+
+void Metadata::SetQuery(const ArrowChunkedArray& array) {
+  SetQueriesFromIterator(array.begin(), array.end());
+}
+
 void Metadata::SetPosition(const data_size_t* positions, data_size_t len) {
   std::lock_guard<std::mutex> lock(mutex_);
   // save to nullptr
@@ -554,7 +602,7 @@ void Metadata::SetPosition(const data_size_t* positions, data_size_t len) {
 
   Log::Debug("number of unique positions found = %ld", position_ids_.size());
 
-  #pragma omp parallel for schedule(static, 512) if (num_positions_ >= 1024)
+  #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_positions_ >= 1024)
   for (data_size_t i = 0; i < num_positions_; ++i) {
     positions_[i] = map_id2pos.at(positions[i]);
   }
@@ -590,7 +638,7 @@ void Metadata::LoadWeights() {
   Log::Info("Loading weights...");
   num_weights_ = static_cast<data_size_t>(reader.Lines().size());
   weights_ = std::vector<label_t>(num_weights_);
-  #pragma omp parallel for schedule(static)
+  #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
   for (data_size_t i = 0; i < num_weights_; ++i) {
     double tmp_weight = 0.0f;
     Common::Atof(reader.Lines()[i].c_str(), &tmp_weight);
@@ -645,7 +693,7 @@ void Metadata::LoadInitialScore(const std::string& data_filename) {
   init_score_ = std::vector<double>(num_init_score_);
   if (num_class == 1) {
-    #pragma omp parallel for schedule(static)
+    #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
     for (data_size_t i = 0; i < num_line; ++i) {
       double tmp = 0.0f;
       Common::Atof(reader.Lines()[i].c_str(), &tmp);
@@ -653,7 +701,7 @@
     }
   } else {
     std::vector<std::string> oneline_init_score;
-    #pragma omp parallel for schedule(static)
+    #pragma omp parallel for
num_threads(OMP_NUM_THREADS()) schedule(static) for (data_size_t i = 0; i < num_line; ++i) { double tmp = 0.0f; oneline_init_score = Common::Split(reader.Lines()[i].c_str(), '\t'); diff --git a/src/io/multi_val_dense_bin.hpp b/src/io/multi_val_dense_bin.hpp index 780272bdc4e1..2bab45f044cd 100644 --- a/src/io/multi_val_dense_bin.hpp +++ b/src/io/multi_val_dense_bin.hpp @@ -271,7 +271,7 @@ class MultiValDenseBin : public MultiValBin { data_size_t block_size = num_data_; Threading::BlockInfo(num_data_, 1024, &n_block, &block_size); -#pragma omp parallel for schedule(static, 1) +#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 1) for (int tid = 0; tid < n_block; ++tid) { data_size_t start = tid * block_size; data_size_t end = std::min(num_data_, start + block_size); diff --git a/src/io/multi_val_sparse_bin.hpp b/src/io/multi_val_sparse_bin.hpp index 32a5a51b4f89..edc48ca84c2a 100644 --- a/src/io/multi_val_sparse_bin.hpp +++ b/src/io/multi_val_sparse_bin.hpp @@ -85,7 +85,7 @@ class MultiValSparseBin : public MultiValBin { offsets[tid + 1] = offsets[tid] + sizes[tid + 1]; } data_.resize(row_ptr_[num_data_]); -#pragma omp parallel for schedule(static, 1) +#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 1) for (int tid = 0; tid < static_cast(t_data_.size()); ++tid) { std::copy_n(t_data_[tid].data(), sizes[tid + 1], data_.data() + offsets[tid]); @@ -344,7 +344,7 @@ class MultiValSparseBin : public MultiValBin { num_data_, 1024, &n_block, &block_size); std::vector sizes(t_data_.size() + 1, 0); const int pre_alloc_size = 50; -#pragma omp parallel for schedule(static, 1) +#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 1) for (int tid = 0; tid < n_block; ++tid) { data_size_t start = tid * block_size; data_size_t end = std::min(num_data_, start + block_size); diff --git a/src/io/train_share_states.cpp b/src/io/train_share_states.cpp index 71b2e097ef1b..4f2ec456c937 100644 --- a/src/io/train_share_states.cpp +++ b/src/io/train_share_states.cpp @@ -56,7 +56,7 @@ void MultiValBinWrapper::HistMove(const std::vector(hist_buf.data()) + hist_buf.size() / 2 - static_cast(num_bin_aligned_); - #pragma omp parallel for schedule(static) + #pragma omp parallel for schedule(static) num_threads(num_threads_) for (int i = 0; i < static_cast(hist_move_src_.size()); ++i) { std::copy_n(src + hist_move_src_[i] / 2, hist_move_size_[i] / 2, reinterpret_cast(origin_hist_data_) + hist_move_dest_[i] / 2); @@ -65,14 +65,14 @@ void MultiValBinWrapper::HistMove(const std::vector(hist_buf.data()) + hist_buf.size() / 2 - static_cast(num_bin_aligned_); if (is_use_subcol_) { - #pragma omp parallel for schedule(static) + #pragma omp parallel for schedule(static) num_threads(num_threads_) for (int i = 0; i < static_cast(hist_move_src_.size()); ++i) { std::copy_n(src + hist_move_src_[i] / 2, hist_move_size_[i] / 2, reinterpret_cast(origin_hist_data_) + hist_move_dest_[i] / 2); } } else { int32_t* orig_ptr = reinterpret_cast(origin_hist_data_); - #pragma omp parallel for schedule(static) + #pragma omp parallel for schedule(static) num_threads(num_threads_) for (int i = 0; i < num_bin_; ++i) { orig_ptr[i] = src[i]; } @@ -81,7 +81,7 @@ void MultiValBinWrapper::HistMove(const std::vector(num_bin_aligned_); - #pragma omp parallel for schedule(static) + #pragma omp parallel for schedule(static) num_threads(num_threads_) for (int i = 0; i < static_cast(hist_move_src_.size()); ++i) { std::copy_n(src + hist_move_src_[i], hist_move_size_[i], origin_hist_data_ + 
hist_move_dest_[i]); diff --git a/src/io/tree.cpp b/src/io/tree.cpp index ce45d20cf454..4312b4f65002 100644 --- a/src/io/tree.cpp +++ b/src/io/tree.cpp @@ -153,7 +153,7 @@ int Tree::SplitCategorical(int leaf, int feature, int real_feature, const uint32 void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, double* score) const { if (!is_linear_ && num_leaves_ <= 1) { if (leaf_value_[0] != 0.0f) { -#pragma omp parallel for schedule(static, 512) if (num_data >= 1024) +#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_data >= 1024) for (data_size_t i = 0; i < num_data; ++i) { score[i] += leaf_value_[0]; } @@ -234,7 +234,7 @@ void Tree::AddPredictionToScore(const Dataset* data, data_size_t num_data, double* score) const { if (!is_linear_ && num_leaves_ <= 1) { if (leaf_value_[0] != 0.0f) { -#pragma omp parallel for schedule(static, 512) if (num_data >= 1024) +#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_data >= 1024) for (data_size_t i = 0; i < num_data; ++i) { score[used_data_indices[i]] += leaf_value_[0]; } diff --git a/src/metric/binary_metric.hpp b/src/metric/binary_metric.hpp index 037f54ba091a..2fc840268e38 100644 --- a/src/metric/binary_metric.hpp +++ b/src/metric/binary_metric.hpp @@ -61,13 +61,13 @@ class BinaryMetric: public Metric { double sum_loss = 0.0f; if (objective == nullptr) { if (weights_ == nullptr) { - #pragma omp parallel for schedule(static) reduction(+:sum_loss) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss) for (data_size_t i = 0; i < num_data_; ++i) { // add loss sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], score[i]); } } else { - #pragma omp parallel for schedule(static) reduction(+:sum_loss) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss) for (data_size_t i = 0; i < num_data_; ++i) { // add loss sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], score[i]) * weights_[i]; @@ -75,7 +75,7 @@ class BinaryMetric: public Metric { } } else { if (weights_ == nullptr) { - #pragma omp parallel for schedule(static) reduction(+:sum_loss) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss) for (data_size_t i = 0; i < num_data_; ++i) { double prob = 0; objective->ConvertOutput(&score[i], &prob); @@ -83,7 +83,7 @@ class BinaryMetric: public Metric { sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], prob); } } else { - #pragma omp parallel for schedule(static) reduction(+:sum_loss) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss) for (data_size_t i = 0; i < num_data_; ++i) { double prob = 0; objective->ConvertOutput(&score[i], &prob); diff --git a/src/metric/cuda/cuda_binary_metric.hpp b/src/metric/cuda/cuda_binary_metric.hpp index 86dbdce986f5..0f61063c620e 100644 --- a/src/metric/cuda/cuda_binary_metric.hpp +++ b/src/metric/cuda/cuda_binary_metric.hpp @@ -10,7 +10,7 @@ #ifdef USE_CUDA #include -#include +#include #include diff --git a/src/metric/cuda/cuda_pointwise_metric.hpp b/src/metric/cuda/cuda_pointwise_metric.hpp index dae1c6a7fa68..fafeafe635c1 100644 --- a/src/metric/cuda/cuda_pointwise_metric.hpp +++ b/src/metric/cuda/cuda_pointwise_metric.hpp @@ -10,7 +10,7 @@ #ifdef USE_CUDA #include -#include +#include #include diff --git a/src/metric/cuda/cuda_regression_metric.hpp b/src/metric/cuda/cuda_regression_metric.hpp index 4cfd996d837d..e69bd221582c 100644 
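Nearly every hunk in this section applies the same transformation: each parallel-for gains an explicit num_threads() clause so the region respects the library-wide cap. A minimal, self-contained sketch of that pattern, assuming only OpenMP; NumThreads() and kMaxNumThreads below are stand-ins for LightGBM's OMP_NUM_THREADS() helper and its LGBM_MAX_NUM_THREADS global, not the library's actual symbols:

    #include <algorithm>
    #include <vector>
    #include <omp.h>

    static int kMaxNumThreads = -1;  // <= 0 means "no cap", mirroring LGBM_SetMaxThreads

    // Stand-in for OMP_NUM_THREADS(): OpenMP's default, clamped by the global cap.
    inline int NumThreads() {
      const int n = omp_get_max_threads();
      return kMaxNumThreads > 0 ? std::min(n, kMaxNumThreads) : n;
    }

    double WeightedLossSum(const std::vector<double>& loss, const std::vector<double>& w) {
      double sum_loss = 0.0;
      // Same shape as the metric loops above: the explicit num_threads() clause keeps
      // this region under the cap even if the OMP_NUM_THREADS environment asks for more.
      #pragma omp parallel for num_threads(NumThreads()) schedule(static) reduction(+:sum_loss)
      for (int i = 0; i < static_cast<int>(loss.size()); ++i) {
        sum_loss += loss[i] * w[i];
      }
      return sum_loss;
    }

    int main() {
      std::vector<double> loss{0.5, 0.25, 1.0}, w{1.0, 2.0, 0.5};
      return WeightedLossSum(loss, w) > 0.0 ? 0 : 1;
    }
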
--- a/src/metric/cuda/cuda_regression_metric.hpp +++ b/src/metric/cuda/cuda_regression_metric.hpp @@ -10,7 +10,7 @@ #ifdef USE_CUDA #include -#include +#include #include diff --git a/src/metric/map_metric.hpp b/src/metric/map_metric.hpp index 18539ee44ee0..76fb4fe8a3e9 100644 --- a/src/metric/map_metric.hpp +++ b/src/metric/map_metric.hpp @@ -111,7 +111,7 @@ class MapMetric:public Metric { } std::vector tmp_map(eval_at_.size(), 0.0f); if (query_weights_ == nullptr) { - #pragma omp parallel for schedule(guided) firstprivate(tmp_map) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(guided) firstprivate(tmp_map) for (data_size_t i = 0; i < num_queries_; ++i) { const int tid = omp_get_thread_num(); CalMapAtK(eval_at_, npos_per_query_[i], label_ + query_boundaries_[i], @@ -121,7 +121,7 @@ class MapMetric:public Metric { } } } else { - #pragma omp parallel for schedule(guided) firstprivate(tmp_map) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(guided) firstprivate(tmp_map) for (data_size_t i = 0; i < num_queries_; ++i) { const int tid = omp_get_thread_num(); CalMapAtK(eval_at_, npos_per_query_[i], label_ + query_boundaries_[i], diff --git a/src/metric/multiclass_metric.hpp b/src/metric/multiclass_metric.hpp index c83b2c842790..dbc769be2e3e 100644 --- a/src/metric/multiclass_metric.hpp +++ b/src/metric/multiclass_metric.hpp @@ -63,7 +63,7 @@ class MulticlassMetric: public Metric { } if (objective != nullptr) { if (weights_ == nullptr) { - #pragma omp parallel for schedule(static) reduction(+:sum_loss) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss) for (data_size_t i = 0; i < num_data_; ++i) { std::vector raw_score(num_tree_per_iteration); for (int k = 0; k < num_tree_per_iteration; ++k) { @@ -76,7 +76,7 @@ class MulticlassMetric: public Metric { sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], &rec, config_); } } else { - #pragma omp parallel for schedule(static) reduction(+:sum_loss) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss) for (data_size_t i = 0; i < num_data_; ++i) { std::vector raw_score(num_tree_per_iteration); for (int k = 0; k < num_tree_per_iteration; ++k) { @@ -91,7 +91,7 @@ class MulticlassMetric: public Metric { } } else { if (weights_ == nullptr) { - #pragma omp parallel for schedule(static) reduction(+:sum_loss) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss) for (data_size_t i = 0; i < num_data_; ++i) { std::vector rec(num_tree_per_iteration); for (int k = 0; k < num_tree_per_iteration; ++k) { @@ -102,7 +102,7 @@ class MulticlassMetric: public Metric { sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], &rec, config_); } } else { - #pragma omp parallel for schedule(static) reduction(+:sum_loss) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss) for (data_size_t i = 0; i < num_data_; ++i) { std::vector rec(num_tree_per_iteration); for (int k = 0; k < num_tree_per_iteration; ++k) { diff --git a/src/metric/rank_metric.hpp b/src/metric/rank_metric.hpp index 888849950be3..e2adb8c082d4 100644 --- a/src/metric/rank_metric.hpp +++ b/src/metric/rank_metric.hpp @@ -57,7 +57,7 @@ class NDCGMetric:public Metric { } inverse_max_dcgs_.resize(num_queries_); // cache the inverse max DCG for all queries, used to calculate NDCG - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) 
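The ranking-metric loops above and below share one more detail worth isolating: a pre-sized scratch vector passed as firstprivate, so each thread receives its own copy and nothing is reallocated inside the per-query loop. A minimal sketch under those assumptions; the real code writes into per-thread result buffers, which is simplified here to a critical section:

    #include <cstdio>
    #include <vector>
    #include <omp.h>

    int main() {
      const int num_queries = 8, num_levels = 3;
      std::vector<double> totals(num_levels, 0.0);
      std::vector<double> tmp(num_levels, 0.0);  // scratch, like tmp_map / tmp_dcg above
      // firstprivate copies the pre-sized scratch buffer into every thread.
      #pragma omp parallel for num_threads(2) schedule(guided) firstprivate(tmp)
      for (int q = 0; q < num_queries; ++q) {
        for (int k = 0; k < num_levels; ++k) tmp[k] = q * 0.1 + k;  // stand-in for CalMapAtK
        #pragma omp critical
        for (int k = 0; k < num_levels; ++k) totals[k] += tmp[k];
      }
      for (int k = 0; k < num_levels; ++k) std::printf("level %d: %g\n", k, totals[k]);
      return 0;
    }
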
for (data_size_t i = 0; i < num_queries_; ++i) { inverse_max_dcgs_[i].resize(eval_at_.size(), 0.0f); DCGCalculator::CalMaxDCG(eval_at_, label_ + query_boundaries_[i], @@ -92,7 +92,7 @@ class NDCGMetric:public Metric { } std::vector tmp_dcg(eval_at_.size(), 0.0f); if (query_weights_ == nullptr) { - #pragma omp parallel for schedule(static) firstprivate(tmp_dcg) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) firstprivate(tmp_dcg) for (data_size_t i = 0; i < num_queries_; ++i) { const int tid = omp_get_thread_num(); // if all doc in this query are all negative, let its NDCG=1 @@ -112,7 +112,7 @@ class NDCGMetric:public Metric { } } } else { - #pragma omp parallel for schedule(static) firstprivate(tmp_dcg) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) firstprivate(tmp_dcg) for (data_size_t i = 0; i < num_queries_; ++i) { const int tid = omp_get_thread_num(); // if all doc in this query are all negative, let its NDCG=1 diff --git a/src/metric/regression_metric.hpp b/src/metric/regression_metric.hpp index 3c4124aad4b9..fbe6f2a062fb 100644 --- a/src/metric/regression_metric.hpp +++ b/src/metric/regression_metric.hpp @@ -59,13 +59,13 @@ class RegressionMetric: public Metric { double sum_loss = 0.0f; if (objective == nullptr) { if (weights_ == nullptr) { - #pragma omp parallel for schedule(static) reduction(+:sum_loss) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss) for (data_size_t i = 0; i < num_data_; ++i) { // add loss sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], score[i], config_); } } else { - #pragma omp parallel for schedule(static) reduction(+:sum_loss) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss) for (data_size_t i = 0; i < num_data_; ++i) { // add loss sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], score[i], config_) * weights_[i]; @@ -73,7 +73,7 @@ class RegressionMetric: public Metric { } } else { if (weights_ == nullptr) { - #pragma omp parallel for schedule(static) reduction(+:sum_loss) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss) for (data_size_t i = 0; i < num_data_; ++i) { // add loss double t = 0; @@ -81,7 +81,7 @@ class RegressionMetric: public Metric { sum_loss += PointWiseLossCalculator::LossOnPoint(label_[i], t, config_); } } else { - #pragma omp parallel for schedule(static) reduction(+:sum_loss) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss) for (data_size_t i = 0; i < num_data_; ++i) { // add loss double t = 0; diff --git a/src/metric/xentropy_metric.hpp b/src/metric/xentropy_metric.hpp index 29d4984c64b3..9c7671c7546d 100644 --- a/src/metric/xentropy_metric.hpp +++ b/src/metric/xentropy_metric.hpp @@ -107,26 +107,26 @@ class CrossEntropyMetric : public Metric { double sum_loss = 0.0f; if (objective == nullptr) { if (weights_ == nullptr) { - #pragma omp parallel for schedule(static) reduction(+:sum_loss) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss) for (data_size_t i = 0; i < num_data_; ++i) { sum_loss += XentLoss(label_[i], score[i]); // NOTE: does not work unless score is a probability } } else { - #pragma omp parallel for schedule(static) reduction(+:sum_loss) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss) for (data_size_t i = 0; i < num_data_; ++i) { sum_loss += XentLoss(label_[i], 
score[i]) * weights_[i]; // NOTE: does not work unless score is a probability } } } else { if (weights_ == nullptr) { - #pragma omp parallel for schedule(static) reduction(+:sum_loss) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss) for (data_size_t i = 0; i < num_data_; ++i) { double p = 0; objective->ConvertOutput(&score[i], &p); sum_loss += XentLoss(label_[i], p); } } else { - #pragma omp parallel for schedule(static) reduction(+:sum_loss) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss) for (data_size_t i = 0; i < num_data_; ++i) { double p = 0; objective->ConvertOutput(&score[i], &p); @@ -192,13 +192,13 @@ class CrossEntropyLambdaMetric : public Metric { double sum_loss = 0.0f; if (objective == nullptr) { if (weights_ == nullptr) { - #pragma omp parallel for schedule(static) reduction(+:sum_loss) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss) for (data_size_t i = 0; i < num_data_; ++i) { double hhat = std::log1p(std::exp(score[i])); // auto-convert sum_loss += XentLambdaLoss(label_[i], 1.0f, hhat); } } else { - #pragma omp parallel for schedule(static) reduction(+:sum_loss) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss) for (data_size_t i = 0; i < num_data_; ++i) { double hhat = std::log1p(std::exp(score[i])); // auto-convert sum_loss += XentLambdaLoss(label_[i], weights_[i], hhat); @@ -206,14 +206,14 @@ class CrossEntropyLambdaMetric : public Metric { } } else { if (weights_ == nullptr) { - #pragma omp parallel for schedule(static) reduction(+:sum_loss) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss) for (data_size_t i = 0; i < num_data_; ++i) { double hhat = 0; objective->ConvertOutput(&score[i], &hhat); // NOTE: this only works if objective = "xentlambda" sum_loss += XentLambdaLoss(label_[i], 1.0f, hhat); } } else { - #pragma omp parallel for schedule(static) reduction(+:sum_loss) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss) for (data_size_t i = 0; i < num_data_; ++i) { double hhat = 0; objective->ConvertOutput(&score[i], &hhat); // NOTE: this only works if objective = "xentlambda" @@ -299,26 +299,26 @@ class KullbackLeiblerDivergence : public Metric { double sum_loss = 0.0f; if (objective == nullptr) { if (weights_ == nullptr) { - #pragma omp parallel for schedule(static) reduction(+:sum_loss) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss) for (data_size_t i = 0; i < num_data_; ++i) { sum_loss += XentLoss(label_[i], score[i]); // NOTE: does not work unless score is a probability } } else { - #pragma omp parallel for schedule(static) reduction(+:sum_loss) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss) for (data_size_t i = 0; i < num_data_; ++i) { sum_loss += XentLoss(label_[i], score[i]) * weights_[i]; // NOTE: does not work unless score is a probability } } } else { if (weights_ == nullptr) { - #pragma omp parallel for schedule(static) reduction(+:sum_loss) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss) for (data_size_t i = 0; i < num_data_; ++i) { double p = 0; objective->ConvertOutput(&score[i], &p); sum_loss += XentLoss(label_[i], p); } } else { - #pragma omp parallel for schedule(static) reduction(+:sum_loss) + #pragma omp parallel for 
num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_loss) for (data_size_t i = 0; i < num_data_; ++i) { double p = 0; objective->ConvertOutput(&score[i], &p); diff --git a/src/objective/binary_objective.hpp b/src/objective/binary_objective.hpp index b770972db9a1..b189cc7701c4 100644 --- a/src/objective/binary_objective.hpp +++ b/src/objective/binary_objective.hpp @@ -63,7 +63,7 @@ class BinaryLogloss: public ObjectiveFunction { data_size_t cnt_positive = 0; data_size_t cnt_negative = 0; // count for positive and negative samples - #pragma omp parallel for schedule(static) reduction(+:cnt_positive, cnt_negative) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:cnt_positive, cnt_negative) for (data_size_t i = 0; i < num_data_; ++i) { if (is_pos_(label_[i])) { ++cnt_positive; @@ -107,7 +107,7 @@ class BinaryLogloss: public ObjectiveFunction { return; } if (weights_ == nullptr) { - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (data_size_t i = 0; i < num_data_; ++i) { // get label and label weights const int is_pos = is_pos_(label_[i]); @@ -120,7 +120,7 @@ class BinaryLogloss: public ObjectiveFunction { hessians[i] = static_cast(abs_response * (sigmoid_ - abs_response) * label_weight); } } else { - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (data_size_t i = 0; i < num_data_; ++i) { // get label and label weights const int is_pos = is_pos_(label_[i]); @@ -140,14 +140,14 @@ class BinaryLogloss: public ObjectiveFunction { double suml = 0.0f; double sumw = 0.0f; if (weights_ != nullptr) { - #pragma omp parallel for schedule(static) reduction(+:suml, sumw) if (!deterministic_) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:suml, sumw) if (!deterministic_) for (data_size_t i = 0; i < num_data_; ++i) { suml += is_pos_(label_[i]) * weights_[i]; sumw += weights_[i]; } } else { sumw = static_cast(num_data_); - #pragma omp parallel for schedule(static) reduction(+:suml) if (!deterministic_) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:suml) if (!deterministic_) for (data_size_t i = 0; i < num_data_; ++i) { suml += is_pos_(label_[i]); } diff --git a/src/objective/multiclass_objective.hpp b/src/objective/multiclass_objective.hpp index 1d9c8da17556..b2a49a9a40ca 100644 --- a/src/objective/multiclass_objective.hpp +++ b/src/objective/multiclass_objective.hpp @@ -86,7 +86,7 @@ class MulticlassSoftmax: public ObjectiveFunction { void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override { if (weights_ == nullptr) { std::vector rec; - #pragma omp parallel for schedule(static) private(rec) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) private(rec) for (data_size_t i = 0; i < num_data_; ++i) { rec.resize(num_class_); for (int k = 0; k < num_class_; ++k) { @@ -107,7 +107,7 @@ class MulticlassSoftmax: public ObjectiveFunction { } } else { std::vector rec; - #pragma omp parallel for schedule(static) private(rec) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) private(rec) for (data_size_t i = 0; i < num_data_; ++i) { rec.resize(num_class_); for (int k = 0; k < num_class_; ++k) { diff --git a/src/objective/rank_objective.hpp b/src/objective/rank_objective.hpp index 6bd5324812f8..ae3b74651759 100644 --- a/src/objective/rank_objective.hpp +++ 
b/src/objective/rank_objective.hpp @@ -58,7 +58,7 @@ class RankingObjective : public ObjectiveFunction { void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override { -#pragma omp parallel for schedule(guided) +#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(guided) for (data_size_t i = 0; i < num_queries_; ++i) { const data_size_t start = query_boundaries_[i]; const data_size_t cnt = query_boundaries_[i + 1] - query_boundaries_[i]; @@ -157,7 +157,7 @@ class LambdarankNDCG : public RankingObjective { DCGCalculator::CheckMetadata(metadata, num_queries_); DCGCalculator::CheckLabel(label_, num_data_); inverse_max_dcgs_.resize(num_queries_); -#pragma omp parallel for schedule(static) +#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (data_size_t i = 0; i < num_queries_; ++i) { inverse_max_dcgs_[i] = DCGCalculator::CalMaxDCGAtK( truncation_level_, label_ + query_boundaries_[i], @@ -289,17 +289,12 @@ class LambdarankNDCG : public RankingObjective { void UpdatePositionBiasFactors(const score_t* lambdas, const score_t* hessians) const override { /// get number of threads - int num_threads = 1; - #pragma omp parallel - #pragma omp master - { - num_threads = omp_get_num_threads(); - } + int num_threads = OMP_NUM_THREADS(); // create per-thread buffers for first and second derivatives of utility w.r.t. position bias factors std::vector bias_first_derivatives(num_position_ids_ * num_threads, 0.0); std::vector bias_second_derivatives(num_position_ids_ * num_threads, 0.0); std::vector instance_counts(num_position_ids_ * num_threads, 0); - #pragma omp parallel for schedule(guided) + #pragma omp parallel for schedule(guided) num_threads(num_threads) for (data_size_t i = 0; i < num_data_; i++) { // get thread ID const int tid = omp_get_thread_num(); @@ -310,7 +305,7 @@ class LambdarankNDCG : public RankingObjective { bias_second_derivatives[offset] -= hessians[i]; instance_counts[offset]++; } - #pragma omp parallel for schedule(guided) + #pragma omp parallel for schedule(guided) num_threads(num_threads) for (data_size_t i = 0; i < num_position_ids_; i++) { double bias_first_derivative = 0.0; double bias_second_derivative = 0.0; diff --git a/src/objective/regression_objective.hpp b/src/objective/regression_objective.hpp index eb149756c205..4f53319bbc49 100644 --- a/src/objective/regression_objective.hpp +++ b/src/objective/regression_objective.hpp @@ -115,7 +115,7 @@ class RegressionL2loss: public ObjectiveFunction { label_ = metadata.label(); if (sqrt_) { trans_label_.resize(num_data_); - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (data_size_t i = 0; i < num_data; ++i) { trans_label_[i] = Common::Sign(label_[i]) * std::sqrt(std::fabs(label_[i])); } @@ -127,13 +127,13 @@ class RegressionL2loss: public ObjectiveFunction { void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override { if (weights_ == nullptr) { - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (data_size_t i = 0; i < num_data_; ++i) { gradients[i] = static_cast(score[i] - label_[i]); hessians[i] = 1.0f; } } else { - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (data_size_t i = 0; i < num_data_; ++i) { gradients[i] = static_cast(static_cast((score[i] - label_[i])) * weights_[i]); hessians[i] = 
static_cast(weights_[i]); @@ -174,14 +174,14 @@ class RegressionL2loss: public ObjectiveFunction { double suml = 0.0f; double sumw = 0.0f; if (weights_ != nullptr) { - #pragma omp parallel for schedule(static) reduction(+:suml, sumw) if (!deterministic_) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:suml, sumw) if (!deterministic_) for (data_size_t i = 0; i < num_data_; ++i) { suml += static_cast(label_[i]) * weights_[i]; sumw += weights_[i]; } } else { sumw = static_cast(num_data_); - #pragma omp parallel for schedule(static) reduction(+:suml) if (!deterministic_) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:suml) if (!deterministic_) for (data_size_t i = 0; i < num_data_; ++i) { suml += label_[i]; } @@ -217,14 +217,14 @@ class RegressionL1loss: public RegressionL2loss { void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override { if (weights_ == nullptr) { - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (data_size_t i = 0; i < num_data_; ++i) { const double diff = score[i] - label_[i]; gradients[i] = static_cast(Common::Sign(diff)); hessians[i] = 1.0f; } } else { - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (data_size_t i = 0; i < num_data_; ++i) { const double diff = score[i] - label_[i]; gradients[i] = static_cast(Common::Sign(diff) * weights_[i]); @@ -313,7 +313,7 @@ class RegressionHuberLoss: public RegressionL2loss { void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override { if (weights_ == nullptr) { - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (data_size_t i = 0; i < num_data_; ++i) { const double diff = score[i] - label_[i]; if (std::abs(diff) <= alpha_) { @@ -324,7 +324,7 @@ class RegressionHuberLoss: public RegressionL2loss { hessians[i] = 1.0f; } } else { - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (data_size_t i = 0; i < num_data_; ++i) { const double diff = score[i] - label_[i]; if (std::abs(diff) <= alpha_) { @@ -362,14 +362,14 @@ class RegressionFairLoss: public RegressionL2loss { void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override { if (weights_ == nullptr) { - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (data_size_t i = 0; i < num_data_; ++i) { const double x = score[i] - label_[i]; gradients[i] = static_cast(c_ * x / (std::fabs(x) + c_)); hessians[i] = static_cast(c_ * c_ / ((std::fabs(x) + c_) * (std::fabs(x) + c_))); } } else { - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (data_size_t i = 0; i < num_data_; ++i) { const double x = score[i] - label_[i]; gradients[i] = static_cast(c_ * x / (std::fabs(x) + c_) * weights_[i]); @@ -441,14 +441,14 @@ class RegressionPoissonLoss: public RegressionL2loss { score_t* hessians) const override { double exp_max_delta_step_ = std::exp(max_delta_step_); if (weights_ == nullptr) { - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (data_size_t i = 0; i < num_data_; ++i) { double exp_score = std::exp(score[i]); 
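For reference, the Poisson branch here implements the standard gradients of the Poisson negative log-likelihood under a log link, with the Hessian inflated by exp(max_delta_step) for numerical stability. A tiny self-contained check of the same formulas; the sample values are arbitrary:

    #include <cmath>
    #include <cstdio>

    // Poisson NLL with log link, score s and label y (dropping terms constant in s):
    //   loss(y, s)   = exp(s) - y * s
    //   d loss / d s = exp(s) - y                      -> gradients[i]
    //   d2 loss/d s2 = exp(s), inflated to exp(s) * exp(max_delta_step)  -> hessians[i]
    int main() {
      const double y = 3.0, s = 1.2, max_delta_step = 0.7;
      const double exp_score = std::exp(s);
      const double grad = exp_score - y;
      const double hess = exp_score * std::exp(max_delta_step);
      std::printf("grad=%.6f hess=%.6f\n", grad, hess);
      return 0;
    }
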
gradients[i] = static_cast(exp_score - label_[i]); hessians[i] = static_cast(exp_score * exp_max_delta_step_); } } else { - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (data_size_t i = 0; i < num_data_; ++i) { double exp_score = std::exp(score[i]); gradients[i] = static_cast((exp_score - label_[i]) * weights_[i]); @@ -493,7 +493,7 @@ class RegressionQuantileloss : public RegressionL2loss { void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override { if (weights_ == nullptr) { - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (data_size_t i = 0; i < num_data_; ++i) { score_t delta = static_cast(score[i] - label_[i]); if (delta >= 0) { @@ -504,7 +504,7 @@ class RegressionQuantileloss : public RegressionL2loss { hessians[i] = 1.0f; } } else { - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (data_size_t i = 0; i < num_data_; ++i) { score_t delta = static_cast(score[i] - label_[i]); if (delta >= 0) { @@ -598,12 +598,12 @@ class RegressionMAPELOSS : public RegressionL1loss { } label_weight_.resize(num_data); if (weights_ == nullptr) { - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (data_size_t i = 0; i < num_data_; ++i) { label_weight_[i] = 1.0f / std::max(1.0f, std::fabs(label_[i])); } } else { - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (data_size_t i = 0; i < num_data_; ++i) { label_weight_[i] = 1.0f / std::max(1.0f, std::fabs(label_[i])) * weights_[i]; } @@ -613,14 +613,14 @@ class RegressionMAPELOSS : public RegressionL1loss { void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override { if (weights_ == nullptr) { - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (data_size_t i = 0; i < num_data_; ++i) { const double diff = score[i] - label_[i]; gradients[i] = static_cast(Common::Sign(diff) * label_weight_[i]); hessians[i] = 1.0f; } } else { - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (data_size_t i = 0; i < num_data_; ++i) { const double diff = score[i] - label_[i]; gradients[i] = static_cast(Common::Sign(diff) * label_weight_[i]); @@ -690,14 +690,14 @@ class RegressionGammaLoss : public RegressionPoissonLoss { void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override { if (weights_ == nullptr) { - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (data_size_t i = 0; i < num_data_; ++i) { double exp_score = std::exp(-score[i]); gradients[i] = static_cast(1.0 - label_[i] * exp_score); hessians[i] = static_cast(label_[i] * exp_score); } } else { - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (data_size_t i = 0; i < num_data_; ++i) { double exp_score = std::exp(-score[i]); gradients[i] = static_cast((1.0 - label_[i] * exp_score) * weights_[i]); @@ -728,7 +728,7 @@ class RegressionTweedieLoss: public RegressionPoissonLoss { void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override { if 
(weights_ == nullptr) { - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (data_size_t i = 0; i < num_data_; ++i) { double exp_1_score = std::exp((1 - rho_) * score[i]); double exp_2_score = std::exp((2 - rho_) * score[i]); @@ -737,7 +737,7 @@ class RegressionTweedieLoss: public RegressionPoissonLoss { (2 - rho_) * exp_2_score); } } else { - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (data_size_t i = 0; i < num_data_; ++i) { double exp_1_score = std::exp((1 - rho_) * score[i]); double exp_2_score = std::exp((2 - rho_) * score[i]); diff --git a/src/objective/xentropy_objective.hpp b/src/objective/xentropy_objective.hpp index 513ccc1c2462..8edb0950609e 100644 --- a/src/objective/xentropy_objective.hpp +++ b/src/objective/xentropy_objective.hpp @@ -77,7 +77,7 @@ class CrossEntropy: public ObjectiveFunction { void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override { if (weights_ == nullptr) { // compute pointwise gradients and Hessians with implied unit weights - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (data_size_t i = 0; i < num_data_; ++i) { const double z = 1.0f / (1.0f + std::exp(-score[i])); gradients[i] = static_cast(z - label_[i]); @@ -85,7 +85,7 @@ class CrossEntropy: public ObjectiveFunction { } } else { // compute pointwise gradients and Hessians with given weights - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (data_size_t i = 0; i < num_data_; ++i) { const double z = 1.0f / (1.0f + std::exp(-score[i])); gradients[i] = static_cast((z - label_[i]) * weights_[i]); @@ -114,7 +114,7 @@ class CrossEntropy: public ObjectiveFunction { double suml = 0.0f; double sumw = 0.0f; if (weights_ != nullptr) { - #pragma omp parallel for schedule(static) reduction(+:suml, sumw) if (!deterministic_) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:suml, sumw) if (!deterministic_) for (data_size_t i = 0; i < num_data_; ++i) { suml += static_cast(label_[i]) * weights_[i]; @@ -122,7 +122,7 @@ class CrossEntropy: public ObjectiveFunction { } } else { sumw = static_cast(num_data_); - #pragma omp parallel for schedule(static) reduction(+:suml) if (!deterministic_) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:suml) if (!deterministic_) for (data_size_t i = 0; i < num_data_; ++i) { suml += label_[i]; @@ -190,7 +190,7 @@ class CrossEntropyLambda: public ObjectiveFunction { void GetGradients(const double* score, score_t* gradients, score_t* hessians) const override { if (weights_ == nullptr) { // compute pointwise gradients and Hessians with implied unit weights; exactly equivalent to CrossEntropy with unit weights - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (data_size_t i = 0; i < num_data_; ++i) { const double z = 1.0f / (1.0f + std::exp(-score[i])); gradients[i] = static_cast(z - label_[i]); @@ -198,7 +198,7 @@ class CrossEntropyLambda: public ObjectiveFunction { } } else { // compute pointwise gradients and Hessians with given weights - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (data_size_t i = 0; i < num_data_; ++i) { const double w = 
weights_[i]; const double y = label_[i]; @@ -244,7 +244,7 @@ class CrossEntropyLambda: public ObjectiveFunction { double suml = 0.0f; double sumw = 0.0f; if (weights_ != nullptr) { - #pragma omp parallel for schedule(static) reduction(+:suml, sumw) if (!deterministic_) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:suml, sumw) if (!deterministic_) for (data_size_t i = 0; i < num_data_; ++i) { suml += static_cast(label_[i]) * weights_[i]; @@ -252,7 +252,7 @@ class CrossEntropyLambda: public ObjectiveFunction { } } else { sumw = static_cast(num_data_); - #pragma omp parallel for schedule(static) reduction(+:suml) if (!deterministic_) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:suml) if (!deterministic_) for (data_size_t i = 0; i < num_data_; ++i) { suml += label_[i]; diff --git a/src/treelearner/col_sampler.hpp b/src/treelearner/col_sampler.hpp index 6debe9db60ca..c70b07e50efa 100644 --- a/src/treelearner/col_sampler.hpp +++ b/src/treelearner/col_sampler.hpp @@ -79,7 +79,7 @@ class ColSampler { static_cast(valid_feature_indices_.size()), used_cnt_bytree_); int omp_loop_size = static_cast(used_feature_indices_.size()); -#pragma omp parallel for schedule(static, 512) if (omp_loop_size >= 1024) +#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (omp_loop_size >= 1024) for (int i = 0; i < omp_loop_size; ++i) { int used_feature = valid_feature_indices_[used_feature_indices_[i]]; int inner_feature_index = train_data_->InnerFeatureIndex(used_feature); @@ -142,7 +142,7 @@ class ColSampler { auto sampled_indices = random_.Sample( static_cast((*allowed_used_feature_indices).size()), used_feature_cnt); int omp_loop_size = static_cast(sampled_indices.size()); -#pragma omp parallel for schedule(static, 512) if (omp_loop_size >= 1024) +#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (omp_loop_size >= 1024) for (int i = 0; i < omp_loop_size; ++i) { int used_feature = valid_feature_indices_[(*allowed_used_feature_indices)[sampled_indices[i]]]; @@ -168,7 +168,7 @@ class ColSampler { auto sampled_indices = random_.Sample( static_cast((*allowed_valid_feature_indices).size()), used_feature_cnt); int omp_loop_size = static_cast(sampled_indices.size()); -#pragma omp parallel for schedule(static, 512) if (omp_loop_size >= 1024) +#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (omp_loop_size >= 1024) for (int i = 0; i < omp_loop_size; ++i) { int used_feature = (*allowed_valid_feature_indices)[sampled_indices[i]]; int inner_feature_index = train_data_->InnerFeatureIndex(used_feature); diff --git a/src/treelearner/cuda/cuda_best_split_finder.cpp b/src/treelearner/cuda/cuda_best_split_finder.cpp index 761b62f21c2e..95758542849c 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cpp +++ b/src/treelearner/cuda/cuda_best_split_finder.cpp @@ -40,6 +40,9 @@ CUDABestSplitFinder::CUDABestSplitFinder( select_features_by_node_(select_features_by_node), cuda_hist_(cuda_hist) { InitFeatureMetaInfo(train_data); + if (has_categorical_feature_ && config->use_quantized_grad) { + Log::Fatal("Quantized training on GPU with categorical features is not supported yet."); + } cuda_leaf_best_split_info_ = nullptr; cuda_best_split_info_ = nullptr; cuda_best_split_info_buffer_ = nullptr; @@ -326,13 +329,23 @@ void CUDABestSplitFinder::FindBestSplitsForLeaf( const data_size_t num_data_in_smaller_leaf, const data_size_t num_data_in_larger_leaf, const double 
sum_hessians_in_smaller_leaf, - const double sum_hessians_in_larger_leaf) { + const double sum_hessians_in_larger_leaf, + const score_t* grad_scale, + const score_t* hess_scale, + const uint8_t smaller_num_bits_in_histogram_bins, + const uint8_t larger_num_bits_in_histogram_bins) { const bool is_smaller_leaf_valid = (num_data_in_smaller_leaf > min_data_in_leaf_ && sum_hessians_in_smaller_leaf > min_sum_hessian_in_leaf_); const bool is_larger_leaf_valid = (num_data_in_larger_leaf > min_data_in_leaf_ && sum_hessians_in_larger_leaf > min_sum_hessian_in_leaf_ && larger_leaf_index >= 0); - LaunchFindBestSplitsForLeafKernel(smaller_leaf_splits, larger_leaf_splits, - smaller_leaf_index, larger_leaf_index, is_smaller_leaf_valid, is_larger_leaf_valid); + if (grad_scale != nullptr && hess_scale != nullptr) { + LaunchFindBestSplitsDiscretizedForLeafKernel(smaller_leaf_splits, larger_leaf_splits, + smaller_leaf_index, larger_leaf_index, is_smaller_leaf_valid, is_larger_leaf_valid, + grad_scale, hess_scale, smaller_num_bits_in_histogram_bins, larger_num_bits_in_histogram_bins); + } else { + LaunchFindBestSplitsForLeafKernel(smaller_leaf_splits, larger_leaf_splits, + smaller_leaf_index, larger_leaf_index, is_smaller_leaf_valid, is_larger_leaf_valid); + } global_timer.Start("CUDABestSplitFinder::LaunchSyncBestSplitForLeafKernel"); LaunchSyncBestSplitForLeafKernel(smaller_leaf_index, larger_leaf_index, is_smaller_leaf_valid, is_larger_leaf_valid); SynchronizeCUDADevice(__FILE__, __LINE__); diff --git a/src/treelearner/cuda/cuda_best_split_finder.cu b/src/treelearner/cuda/cuda_best_split_finder.cu index 3fee5562953c..d5c819d392c9 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.cu +++ b/src/treelearner/cuda/cuda_best_split_finder.cu @@ -320,6 +320,175 @@ __device__ void FindBestSplitsForLeafKernelInner( } } +template +__device__ void FindBestSplitsDiscretizedForLeafKernelInner( + // input feature information + const BIN_HIST_TYPE* feature_hist_ptr, + // input task information + const SplitFindTask* task, + CUDARandom* cuda_random, + // input config parameter values + const double lambda_l1, + const double lambda_l2, + const double path_smooth, + const data_size_t min_data_in_leaf, + const double min_sum_hessian_in_leaf, + const double min_gain_to_split, + // input parent node information + const double parent_gain, + const int64_t sum_gradients_hessians, + const data_size_t num_data, + const double parent_output, + // gradient scale + const double grad_scale, + const double hess_scale, + // output parameters + CUDASplitInfo* cuda_best_split_info) { + const double sum_hessians = static_cast(sum_gradients_hessians & 0x00000000ffffffff) * hess_scale; + const double cnt_factor = num_data / sum_hessians; + const double min_gain_shift = parent_gain + min_gain_to_split; + + cuda_best_split_info->is_valid = false; + + ACC_HIST_TYPE local_grad_hess_hist = 0; + double local_gain = 0.0f; + bool threshold_found = false; + uint32_t threshold_value = 0; + __shared__ int rand_threshold; + if (USE_RAND && threadIdx.x == 0) { + if (task->num_bin - 2 > 0) { + rand_threshold = cuda_random->NextInt(0, task->num_bin - 2); + } + } + __shared__ uint32_t best_thread_index; + __shared__ double shared_double_buffer[32]; + __shared__ bool shared_bool_buffer[32]; + __shared__ uint32_t shared_int_buffer[64]; + const unsigned int threadIdx_x = threadIdx.x; + const bool skip_sum = REVERSE ? 
+ (task->skip_default_bin && (task->num_bin - 1 - threadIdx_x) == static_cast(task->default_bin)) : + (task->skip_default_bin && (threadIdx_x + task->mfb_offset) == static_cast(task->default_bin)); + const uint32_t feature_num_bin_minus_offset = task->num_bin - task->mfb_offset; + if (!REVERSE) { + if (threadIdx_x < feature_num_bin_minus_offset && !skip_sum) { + const unsigned int bin_offset = threadIdx_x; + if (USE_16BIT_BIN_HIST && !USE_16BIT_ACC_HIST) { + const int32_t local_grad_hess_hist_int32 = feature_hist_ptr[bin_offset]; + local_grad_hess_hist = (static_cast(static_cast(local_grad_hess_hist_int32 >> 16)) << 32) | (static_cast(local_grad_hess_hist_int32 & 0x0000ffff)); + } else { + local_grad_hess_hist = feature_hist_ptr[bin_offset]; + } + } + } else { + if (threadIdx_x >= static_cast(task->na_as_missing) && + threadIdx_x < feature_num_bin_minus_offset && !skip_sum) { + const unsigned int read_index = feature_num_bin_minus_offset - 1 - threadIdx_x; + if (USE_16BIT_BIN_HIST && !USE_16BIT_ACC_HIST) { + const int32_t local_grad_hess_hist_int32 = feature_hist_ptr[read_index]; + local_grad_hess_hist = (static_cast(static_cast(local_grad_hess_hist_int32 >> 16)) << 32) | (static_cast(local_grad_hess_hist_int32 & 0x0000ffff)); + } else { + local_grad_hess_hist = feature_hist_ptr[read_index]; + } + } + } + __syncthreads(); + local_gain = kMinScore; + local_grad_hess_hist = ShufflePrefixSum(local_grad_hess_hist, reinterpret_cast(shared_int_buffer)); + double sum_left_gradient = 0.0f; + double sum_left_hessian = 0.0f; + double sum_right_gradient = 0.0f; + double sum_right_hessian = 0.0f; + data_size_t left_count = 0; + data_size_t right_count = 0; + int64_t sum_left_gradient_hessian = 0; + int64_t sum_right_gradient_hessian = 0; + if (REVERSE) { + if (threadIdx_x >= static_cast(task->na_as_missing) && threadIdx_x <= task->num_bin - 2 && !skip_sum) { + sum_right_gradient_hessian = USE_16BIT_ACC_HIST ? 
+ (static_cast(static_cast(local_grad_hess_hist >> 16)) << 32) | static_cast(local_grad_hess_hist & 0x0000ffff) : + local_grad_hess_hist; + sum_right_gradient = static_cast(static_cast((sum_right_gradient_hessian & 0xffffffff00000000) >> 32)) * grad_scale; + sum_right_hessian = static_cast(static_cast(sum_right_gradient_hessian & 0x00000000ffffffff)) * hess_scale; + right_count = static_cast(__double2int_rn(sum_right_hessian * cnt_factor)); + sum_left_gradient_hessian = sum_gradients_hessians - sum_right_gradient_hessian; + sum_left_gradient = static_cast(static_cast((sum_left_gradient_hessian & 0xffffffff00000000)>> 32)) * grad_scale; + sum_left_hessian = static_cast(static_cast(sum_left_gradient_hessian & 0x00000000ffffffff)) * hess_scale; + left_count = num_data - right_count; + if (sum_left_hessian >= min_sum_hessian_in_leaf && left_count >= min_data_in_leaf && + sum_right_hessian >= min_sum_hessian_in_leaf && right_count >= min_data_in_leaf && + (!USE_RAND || static_cast(task->num_bin - 2 - threadIdx_x) == rand_threshold)) { + double current_gain = CUDALeafSplits::GetSplitGains( + sum_left_gradient, sum_left_hessian + kEpsilon, sum_right_gradient, + sum_right_hessian + kEpsilon, lambda_l1, + lambda_l2, path_smooth, left_count, right_count, parent_output); + // gain with split is worse than without split + if (current_gain > min_gain_shift) { + local_gain = current_gain - min_gain_shift; + threshold_value = static_cast(task->num_bin - 2 - threadIdx_x); + threshold_found = true; + } + } + } + } else { + if (threadIdx_x <= feature_num_bin_minus_offset - 2 && !skip_sum) { + sum_left_gradient_hessian = USE_16BIT_ACC_HIST ? + (static_cast(static_cast(local_grad_hess_hist >> 16)) << 32) | static_cast(local_grad_hess_hist & 0x0000ffff) : + local_grad_hess_hist; + sum_left_gradient = static_cast(static_cast((sum_left_gradient_hessian & 0xffffffff00000000) >> 32)) * grad_scale; + sum_left_hessian = static_cast(static_cast(sum_left_gradient_hessian & 0x00000000ffffffff)) * hess_scale; + left_count = static_cast(__double2int_rn(sum_left_hessian * cnt_factor)); + sum_right_gradient_hessian = sum_gradients_hessians - sum_left_gradient_hessian; + sum_right_gradient = static_cast(static_cast((sum_right_gradient_hessian & 0xffffffff00000000) >> 32)) * grad_scale; + sum_right_hessian = static_cast(static_cast(sum_right_gradient_hessian & 0x00000000ffffffff)) * hess_scale; + right_count = num_data - left_count; + if (sum_left_hessian >= min_sum_hessian_in_leaf && left_count >= min_data_in_leaf && + sum_right_hessian >= min_sum_hessian_in_leaf && right_count >= min_data_in_leaf && + (!USE_RAND || static_cast(threadIdx_x + task->mfb_offset) == rand_threshold)) { + double current_gain = CUDALeafSplits::GetSplitGains( + sum_left_gradient, sum_left_hessian + kEpsilon, sum_right_gradient, + sum_right_hessian + kEpsilon, lambda_l1, + lambda_l2, path_smooth, left_count, right_count, parent_output); + // gain with split is worse than without split + if (current_gain > min_gain_shift) { + local_gain = current_gain - min_gain_shift; + threshold_value = static_cast(threadIdx_x + task->mfb_offset); + threshold_found = true; + } + } + } + } + __syncthreads(); + const uint32_t result = ReduceBestGain(local_gain, threshold_found, threadIdx_x, shared_double_buffer, shared_bool_buffer, shared_int_buffer); + if (threadIdx_x == 0) { + best_thread_index = result; + } + __syncthreads(); + if (threshold_found && threadIdx_x == best_thread_index) { + cuda_best_split_info->is_valid = true; + cuda_best_split_info->threshold = 
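// Sketch (not from the PR): ReduceBestGain above elects the thread holding the
// best (found, gain) pair so that exactly one thread writes the split info. A
// warp-only version of the same argmax idea using shuffles; the block-wide
// original also stages partial winners through shared memory.
__device__ inline unsigned int WarpArgMaxGain(double gain, bool found) {
  unsigned int best_lane = threadIdx.x & 31u;
  for (int offset = 16; offset > 0; offset >>= 1) {
    const double other_gain = __shfl_down_sync(0xffffffffu, gain, offset);
    const bool other_found = __shfl_down_sync(0xffffffffu, static_cast<int>(found), offset) != 0;
    const unsigned int other_lane = __shfl_down_sync(0xffffffffu, best_lane, offset);
    if (other_found && (!found || other_gain > gain)) {
      gain = other_gain;
      found = other_found;
      best_lane = other_lane;
    }
  }
  return __shfl_sync(0xffffffffu, best_lane, 0);  // broadcast lane 0's winner
}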
threshold_value; + cuda_best_split_info->gain = local_gain; + cuda_best_split_info->default_left = task->assume_out_default_left; + const double left_output = CUDALeafSplits::CalculateSplittedLeafOutput(sum_left_gradient, + sum_left_hessian, lambda_l1, lambda_l2, path_smooth, left_count, parent_output); + const double right_output = CUDALeafSplits::CalculateSplittedLeafOutput(sum_right_gradient, + sum_right_hessian, lambda_l1, lambda_l2, path_smooth, right_count, parent_output); + cuda_best_split_info->left_sum_gradients = sum_left_gradient; + cuda_best_split_info->left_sum_hessians = sum_left_hessian; + cuda_best_split_info->left_sum_of_gradients_hessians = sum_left_gradient_hessian; + cuda_best_split_info->left_count = left_count; + cuda_best_split_info->right_sum_gradients = sum_right_gradient; + cuda_best_split_info->right_sum_hessians = sum_right_hessian; + cuda_best_split_info->right_sum_of_gradients_hessians = sum_right_gradient_hessian; + cuda_best_split_info->right_count = right_count; + cuda_best_split_info->left_value = left_output; + cuda_best_split_info->left_gain = CUDALeafSplits::GetLeafGainGivenOutput(sum_left_gradient, + sum_left_hessian, lambda_l1, lambda_l2, left_output); + cuda_best_split_info->right_value = right_output; + cuda_best_split_info->right_gain = CUDALeafSplits::GetLeafGainGivenOutput(sum_right_gradient, + sum_right_hessian, lambda_l1, lambda_l2, right_output); + } +} + template __device__ void FindBestSplitsForLeafKernelCategoricalInner( // input feature information @@ -715,6 +884,169 @@ __global__ void FindBestSplitsForLeafKernel( } } + +template +__global__ void FindBestSplitsDiscretizedForLeafKernel( + // input feature information + const int8_t* is_feature_used_bytree, + // input task information + const int num_tasks, + const SplitFindTask* tasks, + CUDARandom* cuda_randoms, + // input leaf information + const CUDALeafSplitsStruct* smaller_leaf_splits, + const CUDALeafSplitsStruct* larger_leaf_splits, + const uint8_t smaller_leaf_num_bits_in_histogram_bin, + const uint8_t larger_leaf_num_bits_in_histogram_bin, + // input config parameter values + const data_size_t min_data_in_leaf, + const double min_sum_hessian_in_leaf, + const double min_gain_to_split, + const double lambda_l1, + const double lambda_l2, + const double path_smooth, + const double cat_smooth, + const double cat_l2, + const int max_cat_threshold, + const int min_data_per_group, + const int max_cat_to_onehot, + // gradient scale + const score_t* grad_scale, + const score_t* hess_scale, + // output + CUDASplitInfo* cuda_best_split_info) { + const unsigned int task_index = blockIdx.x; + const SplitFindTask* task = tasks + task_index; + const int inner_feature_index = task->inner_feature_index; + const double parent_gain = IS_LARGER ? larger_leaf_splits->gain : smaller_leaf_splits->gain; + const int64_t sum_gradients_hessians = IS_LARGER ? larger_leaf_splits->sum_of_gradients_hessians : smaller_leaf_splits->sum_of_gradients_hessians; + const data_size_t num_data = IS_LARGER ? larger_leaf_splits->num_data_in_leaf : smaller_leaf_splits->num_data_in_leaf; + const double parent_output = IS_LARGER ? larger_leaf_splits->leaf_value : smaller_leaf_splits->leaf_value; + const unsigned int output_offset = IS_LARGER ? (task_index + num_tasks) : task_index; + CUDASplitInfo* out = cuda_best_split_info + output_offset; + CUDARandom* cuda_random = USE_RAND ? + (IS_LARGER ? cuda_randoms + task_index * 2 + 1 : cuda_randoms + task_index * 2) : nullptr; + const bool use_16bit_bin = IS_LARGER ? 
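// Sketch (not from the PR): the left/right leaf values written above come from
// the usual second-order solution; ignoring path smoothing, the
// CalculateSplittedLeafOutput call reduces to a soft-thresholded Newton step:
//   output = -soft_threshold(G, lambda_l1) / (H + lambda_l2)
#include <cmath>

inline double SoftThreshold(double g, double lambda_l1) {
  if (g > lambda_l1) return g - lambda_l1;
  if (g < -lambda_l1) return g + lambda_l1;
  return 0.0;
}

inline double LeafOutputSketch(double sum_grad, double sum_hess,
                               double lambda_l1, double lambda_l2) {
  return -SoftThreshold(sum_grad, lambda_l1) / (sum_hess + lambda_l2);
}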
(larger_leaf_num_bits_in_histogram_bin <= 16) : (smaller_leaf_num_bits_in_histogram_bin <= 16); + if (is_feature_used_bytree[inner_feature_index]) { + if (task->is_categorical) { + __threadfence(); // ensure store issued before trap + asm("trap;"); + } else { + if (!task->reverse) { + if (use_16bit_bin) { + const int32_t* hist_ptr = + reinterpret_cast(IS_LARGER ? larger_leaf_splits->hist_in_leaf : smaller_leaf_splits->hist_in_leaf) + task->hist_offset; + FindBestSplitsDiscretizedForLeafKernelInner( + // input feature information + hist_ptr, + // input task information + task, + cuda_random, + // input config parameter values + lambda_l1, + lambda_l2, + path_smooth, + min_data_in_leaf, + min_sum_hessian_in_leaf, + min_gain_to_split, + // input parent node information + parent_gain, + sum_gradients_hessians, + num_data, + parent_output, + // gradient scale + *grad_scale, + *hess_scale, + // output parameters + out); + } else { + const int32_t* hist_ptr = + reinterpret_cast(IS_LARGER ? larger_leaf_splits->hist_in_leaf : smaller_leaf_splits->hist_in_leaf) + task->hist_offset; + FindBestSplitsDiscretizedForLeafKernelInner( + // input feature information + hist_ptr, + // input task information + task, + cuda_random, + // input config parameter values + lambda_l1, + lambda_l2, + path_smooth, + min_data_in_leaf, + min_sum_hessian_in_leaf, + min_gain_to_split, + // input parent node information + parent_gain, + sum_gradients_hessians, + num_data, + parent_output, + // gradient scale + *grad_scale, + *hess_scale, + // output parameters + out); + } + } else { + if (use_16bit_bin) { + const int32_t* hist_ptr = + reinterpret_cast(IS_LARGER ? larger_leaf_splits->hist_in_leaf : smaller_leaf_splits->hist_in_leaf) + task->hist_offset; + FindBestSplitsDiscretizedForLeafKernelInner( + // input feature information + hist_ptr, + // input task information + task, + cuda_random, + // input config parameter values + lambda_l1, + lambda_l2, + path_smooth, + min_data_in_leaf, + min_sum_hessian_in_leaf, + min_gain_to_split, + // input parent node information + parent_gain, + sum_gradients_hessians, + num_data, + parent_output, + // gradient scale + *grad_scale, + *hess_scale, + // output parameters + out); + } else { + const int32_t* hist_ptr = + reinterpret_cast(IS_LARGER ? 
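// Sketch (not from the PR): the branches above fold runtime facts (16-bit vs
// 32-bit bins, forward vs reverse scan) into template arguments so the inner
// loop compiles with those flags as constants and the dead branch disappears.
// The dispatch pattern in miniature:
template <bool USE_16BIT>
__global__ void HistBitWidthKernel(int* out) {
  out[threadIdx.x] = USE_16BIT ? 16 : 32;  // resolved at compile time
}

inline void LaunchHistBitWidth(bool use_16bit, int* out) {
  if (use_16bit) {
    HistBitWidthKernel<true><<<1, 32>>>(out);
  } else {
    HistBitWidthKernel<false><<<1, 32>>>(out);
  }
}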
larger_leaf_splits->hist_in_leaf : smaller_leaf_splits->hist_in_leaf) + task->hist_offset; + FindBestSplitsDiscretizedForLeafKernelInner( + // input feature information + hist_ptr, + // input task information + task, + cuda_random, + // input config parameter values + lambda_l1, + lambda_l2, + path_smooth, + min_data_in_leaf, + min_sum_hessian_in_leaf, + min_gain_to_split, + // input parent node information + parent_gain, + sum_gradients_hessians, + num_data, + parent_output, + // gradient scale + *grad_scale, + *hess_scale, + // output parameters + out); + } + } + } + } else { + out->is_valid = false; + } +} + template __device__ void FindBestSplitsForLeafKernelInner_GlobalMemory( // input feature information @@ -1466,6 +1798,108 @@ void CUDABestSplitFinder::LaunchFindBestSplitsForLeafKernelInner2(LaunchFindBest #undef FindBestSplitsForLeafKernel_ARGS #undef GlobalMemory_Buffer_ARGS + +#define LaunchFindBestSplitsDiscretizedForLeafKernel_PARAMS \ + const CUDALeafSplitsStruct* smaller_leaf_splits, \ + const CUDALeafSplitsStruct* larger_leaf_splits, \ + const int smaller_leaf_index, \ + const int larger_leaf_index, \ + const bool is_smaller_leaf_valid, \ + const bool is_larger_leaf_valid, \ + const score_t* grad_scale, \ + const score_t* hess_scale, \ + const uint8_t smaller_num_bits_in_histogram_bins, \ + const uint8_t larger_num_bits_in_histogram_bins + +#define LaunchFindBestSplitsDiscretizedForLeafKernel_ARGS \ + smaller_leaf_splits, \ + larger_leaf_splits, \ + smaller_leaf_index, \ + larger_leaf_index, \ + is_smaller_leaf_valid, \ + is_larger_leaf_valid, \ + grad_scale, \ + hess_scale, \ + smaller_num_bits_in_histogram_bins, \ + larger_num_bits_in_histogram_bins + +#define FindBestSplitsDiscretizedForLeafKernel_ARGS \ + cuda_is_feature_used_bytree_, \ + num_tasks_, \ + cuda_split_find_tasks_.RawData(), \ + cuda_randoms_.RawData(), \ + smaller_leaf_splits, \ + larger_leaf_splits, \ + smaller_num_bits_in_histogram_bins, \ + larger_num_bits_in_histogram_bins, \ + min_data_in_leaf_, \ + min_sum_hessian_in_leaf_, \ + min_gain_to_split_, \ + lambda_l1_, \ + lambda_l2_, \ + path_smooth_, \ + cat_smooth_, \ + cat_l2_, \ + max_cat_threshold_, \ + min_data_per_group_, \ + max_cat_to_onehot_, \ + grad_scale, \ + hess_scale, \ + cuda_best_split_info_ + +void CUDABestSplitFinder::LaunchFindBestSplitsDiscretizedForLeafKernel(LaunchFindBestSplitsDiscretizedForLeafKernel_PARAMS) { + if (!is_smaller_leaf_valid && !is_larger_leaf_valid) { + return; + } + if (!extra_trees_) { + LaunchFindBestSplitsDiscretizedForLeafKernelInner0(LaunchFindBestSplitsDiscretizedForLeafKernel_ARGS); + } else { + LaunchFindBestSplitsDiscretizedForLeafKernelInner0(LaunchFindBestSplitsDiscretizedForLeafKernel_ARGS); + } +} + +template +void CUDABestSplitFinder::LaunchFindBestSplitsDiscretizedForLeafKernelInner0(LaunchFindBestSplitsDiscretizedForLeafKernel_PARAMS) { + if (lambda_l1_ <= 0.0f) { + LaunchFindBestSplitsDiscretizedForLeafKernelInner1(LaunchFindBestSplitsDiscretizedForLeafKernel_ARGS); + } else { + LaunchFindBestSplitsDiscretizedForLeafKernelInner1(LaunchFindBestSplitsDiscretizedForLeafKernel_ARGS); + } +} + +template +void CUDABestSplitFinder::LaunchFindBestSplitsDiscretizedForLeafKernelInner1(LaunchFindBestSplitsDiscretizedForLeafKernel_PARAMS) { + if (!use_smoothing_) { + LaunchFindBestSplitsDiscretizedForLeafKernelInner2(LaunchFindBestSplitsDiscretizedForLeafKernel_ARGS); + } else { + LaunchFindBestSplitsDiscretizedForLeafKernelInner2(LaunchFindBestSplitsDiscretizedForLeafKernel_ARGS); + } +} + +template +void 
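// Sketch (not from the PR): the *_PARAMS / *_ARGS macro pair defined below
// keeps one long parameter list in sync across the whole dispatch chain; each
// stage peels a runtime flag into a template argument and forwards the rest
// verbatim. The same trick with assumed names:
#define SKETCH_PARAMS const float* grad, const float* hess, int n
#define SKETCH_ARGS grad, hess, n

template <bool USE_L1>
void SketchInner(SKETCH_PARAMS) { (void)grad; (void)hess; (void)n; /* next stage */ }

inline void SketchDispatch(bool use_l1, SKETCH_PARAMS) {
  if (use_l1) {
    SketchInner<true>(SKETCH_ARGS);
  } else {
    SketchInner<false>(SKETCH_ARGS);
  }
}

#undef SKETCH_PARAMS
#undef SKETCH_ARGS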
CUDABestSplitFinder::LaunchFindBestSplitsDiscretizedForLeafKernelInner2(LaunchFindBestSplitsDiscretizedForLeafKernel_PARAMS) { + if (!use_global_memory_) { + if (is_smaller_leaf_valid) { + FindBestSplitsDiscretizedForLeafKernel + <<>> + (FindBestSplitsDiscretizedForLeafKernel_ARGS); + } + SynchronizeCUDADevice(__FILE__, __LINE__); + if (is_larger_leaf_valid) { + FindBestSplitsDiscretizedForLeafKernel + <<>> + (FindBestSplitsDiscretizedForLeafKernel_ARGS); + } + } else { + // TODO(shiyu1994) + } +} + +#undef LaunchFindBestSplitsDiscretizedForLeafKernel_PARAMS +#undef LaunchFindBestSplitsDiscretizedForLeafKernel_ARGS +#undef FindBestSplitsDiscretizedForLeafKernel_ARGS + + __device__ void ReduceBestSplit(bool* found, double* gain, uint32_t* shared_read_index, uint32_t num_features_aligned) { const uint32_t threadIdx_x = threadIdx.x; diff --git a/src/treelearner/cuda/cuda_best_split_finder.hpp b/src/treelearner/cuda/cuda_best_split_finder.hpp index 69f8169f8d85..2d9940312533 100644 --- a/src/treelearner/cuda/cuda_best_split_finder.hpp +++ b/src/treelearner/cuda/cuda_best_split_finder.hpp @@ -67,7 +67,11 @@ class CUDABestSplitFinder { const data_size_t num_data_in_smaller_leaf, const data_size_t num_data_in_larger_leaf, const double sum_hessians_in_smaller_leaf, - const double sum_hessians_in_larger_leaf); + const double sum_hessians_in_larger_leaf, + const score_t* grad_scale, + const score_t* hess_scale, + const uint8_t smaller_num_bits_in_histogram_bins, + const uint8_t larger_num_bits_in_histogram_bins); const CUDASplitInfo* FindBestFromAllSplits( const int cur_num_leaves, @@ -114,6 +118,31 @@ class CUDABestSplitFinder { #undef LaunchFindBestSplitsForLeafKernel_PARAMS + #define LaunchFindBestSplitsDiscretizedForLeafKernel_PARAMS \ + const CUDALeafSplitsStruct* smaller_leaf_splits, \ + const CUDALeafSplitsStruct* larger_leaf_splits, \ + const int smaller_leaf_index, \ + const int larger_leaf_index, \ + const bool is_smaller_leaf_valid, \ + const bool is_larger_leaf_valid, \ + const score_t* grad_scale, \ + const score_t* hess_scale, \ + const uint8_t smaller_num_bits_in_histogram_bins, \ + const uint8_t larger_num_bits_in_histogram_bins + + void LaunchFindBestSplitsDiscretizedForLeafKernel(LaunchFindBestSplitsDiscretizedForLeafKernel_PARAMS); + + template + void LaunchFindBestSplitsDiscretizedForLeafKernelInner0(LaunchFindBestSplitsDiscretizedForLeafKernel_PARAMS); + + template + void LaunchFindBestSplitsDiscretizedForLeafKernelInner1(LaunchFindBestSplitsDiscretizedForLeafKernel_PARAMS); + + template + void LaunchFindBestSplitsDiscretizedForLeafKernelInner2(LaunchFindBestSplitsDiscretizedForLeafKernel_PARAMS); + + #undef LaunchFindBestSplitsDiscretizedForLeafKernel_PARAMS + void LaunchSyncBestSplitForLeafKernel( const int host_smaller_leaf_index, const int host_larger_leaf_index, diff --git a/src/treelearner/cuda/cuda_data_partition.cpp b/src/treelearner/cuda/cuda_data_partition.cpp index 3ad157ef0105..c09021ad356f 100644 --- a/src/treelearner/cuda/cuda_data_partition.cpp +++ b/src/treelearner/cuda/cuda_data_partition.cpp @@ -368,6 +368,12 @@ void CUDADataPartition::ResetByLeafPred(const std::vector& leaf_pred, int n cur_num_leaves_ = num_leaves; } +void CUDADataPartition::ReduceLeafGradStat( + const score_t* gradients, const score_t* hessians, + CUDATree* tree, double* leaf_grad_stat_buffer, double* leaf_hess_state_buffer) const { + LaunchReduceLeafGradStat(gradients, hessians, tree, leaf_grad_stat_buffer, leaf_hess_state_buffer); +} + } // namespace LightGBM #endif // USE_CUDA diff --git 
a/src/treelearner/cuda/cuda_data_partition.cu b/src/treelearner/cuda/cuda_data_partition.cu index b1d3fa496ab9..3090b7a84176 100644 --- a/src/treelearner/cuda/cuda_data_partition.cu +++ b/src/treelearner/cuda/cuda_data_partition.cu @@ -1069,6 +1069,53 @@ void CUDADataPartition::LaunchAddPredictionToScoreKernel(const double* leaf_valu global_timer.Stop("CUDADataPartition::AddPredictionToScoreKernel"); } +__global__ void RenewDiscretizedTreeLeavesKernel( + const score_t* gradients, + const score_t* hessians, + const data_size_t* data_indices, + const data_size_t* leaf_data_start, + const data_size_t* leaf_num_data, + double* leaf_grad_stat_buffer, + double* leaf_hess_stat_buffer, + double* leaf_values) { + __shared__ double shared_mem_buffer[32]; + const int leaf_index = static_cast(blockIdx.x); + const data_size_t* data_indices_in_leaf = data_indices + leaf_data_start[leaf_index]; + const data_size_t num_data_in_leaf = leaf_num_data[leaf_index]; + double sum_gradients = 0.0f; + double sum_hessians = 0.0f; + for (data_size_t inner_data_index = static_cast(threadIdx.x); + inner_data_index < num_data_in_leaf; inner_data_index += static_cast(blockDim.x)) { + const data_size_t data_index = data_indices_in_leaf[inner_data_index]; + const score_t gradient = gradients[data_index]; + const score_t hessian = hessians[data_index]; + sum_gradients += static_cast(gradient); + sum_hessians += static_cast(hessian); + } + sum_gradients = ShuffleReduceSum(sum_gradients, shared_mem_buffer, blockDim.x); + __syncthreads(); + sum_hessians = ShuffleReduceSum(sum_hessians, shared_mem_buffer, blockDim.x); + if (threadIdx.x == 0) { + leaf_grad_stat_buffer[leaf_index] = sum_gradients; + leaf_hess_stat_buffer[leaf_index] = sum_hessians; + } +} + +void CUDADataPartition::LaunchReduceLeafGradStat( + const score_t* gradients, const score_t* hessians, + CUDATree* tree, double* leaf_grad_stat_buffer, double* leaf_hess_state_buffer) const { + const int num_blocks = tree->num_leaves(); + RenewDiscretizedTreeLeavesKernel<<>>( + gradients, + hessians, + cuda_data_indices_, + cuda_leaf_data_start_, + cuda_leaf_num_data_, + leaf_grad_stat_buffer, + leaf_hess_state_buffer, + tree->cuda_leaf_value_ref()); +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_data_partition.hpp b/src/treelearner/cuda/cuda_data_partition.hpp index 84050565c085..f6bbab9b8c65 100644 --- a/src/treelearner/cuda/cuda_data_partition.hpp +++ b/src/treelearner/cuda/cuda_data_partition.hpp @@ -78,6 +78,10 @@ class CUDADataPartition { void ResetByLeafPred(const std::vector& leaf_pred, int num_leaves); + void ReduceLeafGradStat( + const score_t* gradients, const score_t* hessians, + CUDATree* tree, double* leaf_grad_stat_buffer, double* leaf_hess_state_buffer) const; + data_size_t root_num_data() const { if (use_bagging_) { return num_used_indices_; @@ -292,6 +296,10 @@ class CUDADataPartition { void LaunchFillDataIndexToLeafIndex(); + void LaunchReduceLeafGradStat( + const score_t* gradients, const score_t* hessians, + CUDATree* tree, double* leaf_grad_stat_buffer, double* leaf_hess_state_buffer) const; + // Host memory // dataset information diff --git a/src/treelearner/cuda/cuda_gradient_discretizer.cu b/src/treelearner/cuda/cuda_gradient_discretizer.cu new file mode 100644 index 000000000000..bcea706b4285 --- /dev/null +++ b/src/treelearner/cuda/cuda_gradient_discretizer.cu @@ -0,0 +1,171 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. 
See LICENSE file in the project root for
+ * license information.
+ */
+
+#ifdef USE_CUDA
+
+#include <LightGBM/cuda/cuda_algorithms.hpp>
+
+#include <algorithm>
+
+#include "cuda_gradient_discretizer.hpp"
+
+namespace LightGBM {
+
+__global__ void ReduceMinMaxKernel(
+    const data_size_t num_data,
+    const score_t* input_gradients,
+    const score_t* input_hessians,
+    score_t* grad_min_block_buffer,
+    score_t* grad_max_block_buffer,
+    score_t* hess_min_block_buffer,
+    score_t* hess_max_block_buffer) {
+  __shared__ score_t shared_mem_buffer[32];
+  const data_size_t index = static_cast<data_size_t>(threadIdx.x + blockIdx.x * blockDim.x);
+  score_t grad_max_val = kMinScore;
+  score_t grad_min_val = kMaxScore;
+  score_t hess_max_val = kMinScore;
+  score_t hess_min_val = kMaxScore;
+  if (index < num_data) {
+    grad_max_val = input_gradients[index];
+    grad_min_val = input_gradients[index];
+    hess_max_val = input_hessians[index];
+    hess_min_val = input_hessians[index];
+  }
+  grad_min_val = ShuffleReduceMin<score_t>(grad_min_val, shared_mem_buffer, blockDim.x);
+  __syncthreads();
+  grad_max_val = ShuffleReduceMax<score_t>(grad_max_val, shared_mem_buffer, blockDim.x);
+  __syncthreads();
+  hess_min_val = ShuffleReduceMin<score_t>(hess_min_val, shared_mem_buffer, blockDim.x);
+  __syncthreads();
+  hess_max_val = ShuffleReduceMax<score_t>(hess_max_val, shared_mem_buffer, blockDim.x);
+  if (threadIdx.x == 0) {
+    grad_min_block_buffer[blockIdx.x] = grad_min_val;
+    grad_max_block_buffer[blockIdx.x] = grad_max_val;
+    hess_min_block_buffer[blockIdx.x] = hess_min_val;
+    hess_max_block_buffer[blockIdx.x] = hess_max_val;
+  }
+}
+
+__global__ void ReduceBlockMinMaxKernel(
+    const int num_blocks,
+    const int grad_discretize_bins,
+    score_t* grad_min_block_buffer,
+    score_t* grad_max_block_buffer,
+    score_t* hess_min_block_buffer,
+    score_t* hess_max_block_buffer) {
+  __shared__ score_t shared_mem_buffer[32];
+  score_t grad_max_val = kMinScore;
+  score_t grad_min_val = kMaxScore;
+  score_t hess_max_val = kMinScore;
+  score_t hess_min_val = kMaxScore;
+  for (int block_index = static_cast<int>(threadIdx.x); block_index < num_blocks; block_index += static_cast<int>(blockDim.x)) {
+    grad_min_val = min(grad_min_val, grad_min_block_buffer[block_index]);
+    grad_max_val = max(grad_max_val, grad_max_block_buffer[block_index]);
+    hess_min_val = min(hess_min_val, hess_min_block_buffer[block_index]);
+    hess_max_val = max(hess_max_val, hess_max_block_buffer[block_index]);
+  }
+  grad_min_val = ShuffleReduceMin<score_t>(grad_min_val, shared_mem_buffer, blockDim.x);
+  __syncthreads();
+  grad_max_val = ShuffleReduceMax<score_t>(grad_max_val, shared_mem_buffer, blockDim.x);
+  __syncthreads();
+  hess_min_val = ShuffleReduceMin<score_t>(hess_min_val, shared_mem_buffer, blockDim.x);
+  __syncthreads();
+  hess_max_val = ShuffleReduceMax<score_t>(hess_max_val, shared_mem_buffer, blockDim.x);
+  if (threadIdx.x == 0) {
+    const score_t grad_abs_max = max(fabs(grad_min_val), fabs(grad_max_val));
+    const score_t hess_abs_max = max(fabs(hess_min_val), fabs(hess_max_val));
+    grad_min_block_buffer[0] = 1.0f / (grad_abs_max / (grad_discretize_bins / 2));
+    grad_max_block_buffer[0] = (grad_abs_max / (grad_discretize_bins / 2));
+    hess_min_block_buffer[0] = 1.0f / (hess_abs_max / (grad_discretize_bins));
+    hess_max_block_buffer[0] = (hess_abs_max / (grad_discretize_bins));
+  }
+}
+
+template <bool STOCHASTIC_ROUNDING>
+__global__ void DiscretizeGradientsKernel(
+    const data_size_t num_data,
+    const score_t* input_gradients,
+    const score_t* input_hessians,
+    const score_t* grad_scale_ptr,
+    const score_t* hess_scale_ptr,
+    const int iter,
+    const int* random_values_use_start,
+    const score_t* gradient_random_values,
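// Sketch (not from the PR): ReduceBlockMinMaxKernel above turns the global
// gradient/hessian extrema into quantization scales. Gradients are signed, so
// half the bins sit on each side of zero; hessians are non-negative and use
// the full range. Slot 0 of the min buffers ends up holding the multiplier
// applied when quantizing, slot 0 of the max buffers the step size used to
// de-quantize (what grad_scale_ptr()/hess_scale_ptr() later expose).
#include <algorithm>
#include <cmath>

struct ScalePair {
  double quantize_mult;    // raw value * this -> integer level
  double dequantize_step;  // integer level * this -> raw value
};

inline ScalePair GradScaleFromAbsMax(double min_val, double max_val, int num_bins) {
  const double abs_max = std::max(std::fabs(min_val), std::fabs(max_val));
  const double step = abs_max / (num_bins / 2);  // signed: half the bins per side
  return ScalePair{1.0 / step, step};
}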
const score_t* hessian_random_values, + const int grad_discretize_bins, + int8_t* output_gradients_and_hessians) { + const int start = random_values_use_start[iter]; + const data_size_t index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + const score_t grad_scale = *grad_scale_ptr; + const score_t hess_scale = *hess_scale_ptr; + int16_t* output_gradients_and_hessians_ptr = reinterpret_cast(output_gradients_and_hessians); + if (index < num_data) { + if (STOCHASTIC_ROUNDING) { + const data_size_t index_offset = (index + start) % num_data; + const score_t gradient = input_gradients[index]; + const score_t hessian = input_hessians[index]; + const score_t gradient_random_value = gradient_random_values[index_offset]; + const score_t hessian_random_value = hessian_random_values[index_offset]; + output_gradients_and_hessians_ptr[2 * index + 1] = gradient > 0.0f ? + static_cast(gradient * grad_scale + gradient_random_value) : + static_cast(gradient * grad_scale - gradient_random_value); + output_gradients_and_hessians_ptr[2 * index] = static_cast(hessian * hess_scale + hessian_random_value); + } else { + const score_t gradient = input_gradients[index]; + const score_t hessian = input_hessians[index]; + output_gradients_and_hessians_ptr[2 * index + 1] = gradient > 0.0f ? + static_cast(gradient * grad_scale + 0.5) : + static_cast(gradient * grad_scale - 0.5); + output_gradients_and_hessians_ptr[2 * index] = static_cast(hessian * hess_scale + 0.5); + } + } +} + +void CUDAGradientDiscretizer::DiscretizeGradients( + const data_size_t num_data, + const score_t* input_gradients, + const score_t* input_hessians) { + ReduceMinMaxKernel<<>>( + num_data, input_gradients, input_hessians, + grad_min_block_buffer_.RawData(), + grad_max_block_buffer_.RawData(), + hess_min_block_buffer_.RawData(), + hess_max_block_buffer_.RawData()); + SynchronizeCUDADevice(__FILE__, __LINE__); + ReduceBlockMinMaxKernel<<<1, CUDA_GRADIENT_DISCRETIZER_BLOCK_SIZE>>>( + num_reduce_blocks_, + num_grad_quant_bins_, + grad_min_block_buffer_.RawData(), + grad_max_block_buffer_.RawData(), + hess_min_block_buffer_.RawData(), + hess_max_block_buffer_.RawData()); + SynchronizeCUDADevice(__FILE__, __LINE__); + + #define DiscretizeGradientsKernel_ARGS \ + num_data, \ + input_gradients, \ + input_hessians, \ + grad_min_block_buffer_.RawData(), \ + hess_min_block_buffer_.RawData(), \ + iter_, \ + random_values_use_start_.RawData(), \ + gradient_random_values_.RawData(), \ + hessian_random_values_.RawData(), \ + num_grad_quant_bins_, \ + discretized_gradients_and_hessians_.RawData() + + if (stochastic_rounding_) { + DiscretizeGradientsKernel<<>>(DiscretizeGradientsKernel_ARGS); + } else { + DiscretizeGradientsKernel<<>>(DiscretizeGradientsKernel_ARGS); + } + SynchronizeCUDADevice(__FILE__, __LINE__); + ++iter_; +} + +} // namespace LightGBM + +#endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_gradient_discretizer.hpp b/src/treelearner/cuda/cuda_gradient_discretizer.hpp new file mode 100644 index 000000000000..d5c2fb0e041a --- /dev/null +++ b/src/treelearner/cuda/cuda_gradient_discretizer.hpp @@ -0,0 +1,118 @@ +/*! + * Copyright (c) 2021 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for + * license information. 
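// Sketch (not from the PR): DiscretizeGradientsKernel above supports two
// rounding modes. Nearest rounding adds +/-0.5 before truncation; stochastic
// rounding instead adds a pre-generated uniform u in [0, 1), which makes the
// quantized value unbiased (E[q] = x * scale), so quantization error averages
// out across boosting iterations. Scalar form of both:
#include <cstdint>

inline int16_t QuantizeNearest(float x, float scale) {
  return static_cast<int16_t>(x > 0.0f ? x * scale + 0.5f : x * scale - 0.5f);
}

inline int16_t QuantizeStochastic(float x, float scale, float u) {  // u uniform in [0, 1)
  return static_cast<int16_t>(x > 0.0f ? x * scale + u : x * scale - u);
}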
+ */ + +#ifndef LIGHTGBM_TREELEARNER_CUDA_CUDA_GRADIENT_DISCRETIZER_HPP_ +#define LIGHTGBM_TREELEARNER_CUDA_CUDA_GRADIENT_DISCRETIZER_HPP_ + +#ifdef USE_CUDA + +#include +#include +#include +#include + +#include +#include +#include + +#include "cuda_leaf_splits.hpp" +#include "../gradient_discretizer.hpp" + +namespace LightGBM { + +#define CUDA_GRADIENT_DISCRETIZER_BLOCK_SIZE (1024) + +class CUDAGradientDiscretizer: public GradientDiscretizer { + public: + CUDAGradientDiscretizer(int num_grad_quant_bins, int num_trees, int random_seed, bool is_constant_hessian, bool stochastic_roudning): + GradientDiscretizer(num_grad_quant_bins, num_trees, random_seed, is_constant_hessian, stochastic_roudning) { + } + + void DiscretizeGradients( + const data_size_t num_data, + const score_t* input_gradients, + const score_t* input_hessians) override; + + const int8_t* discretized_gradients_and_hessians() const override { return discretized_gradients_and_hessians_.RawData(); } + + double grad_scale() const override { + Log::Fatal("grad_scale() of CUDAGradientDiscretizer should not be called."); + return 0.0; + } + + double hess_scale() const override { + Log::Fatal("hess_scale() of CUDAGradientDiscretizer should not be called."); + return 0.0; + } + + const score_t* grad_scale_ptr() const { return grad_max_block_buffer_.RawData(); } + + const score_t* hess_scale_ptr() const { return hess_max_block_buffer_.RawData(); } + + void Init(const data_size_t num_data, const int num_leaves, + const int num_features, const Dataset* train_data) override { + GradientDiscretizer::Init(num_data, num_leaves, num_features, train_data); + discretized_gradients_and_hessians_.Resize(num_data * 2); + num_reduce_blocks_ = (num_data + CUDA_GRADIENT_DISCRETIZER_BLOCK_SIZE - 1) / CUDA_GRADIENT_DISCRETIZER_BLOCK_SIZE; + grad_min_block_buffer_.Resize(num_reduce_blocks_); + grad_max_block_buffer_.Resize(num_reduce_blocks_); + hess_min_block_buffer_.Resize(num_reduce_blocks_); + hess_max_block_buffer_.Resize(num_reduce_blocks_); + random_values_use_start_.Resize(num_trees_); + gradient_random_values_.Resize(num_data); + hessian_random_values_.Resize(num_data); + + std::vector gradient_random_values(num_data, 0.0f); + std::vector hessian_random_values(num_data, 0.0f); + std::vector random_values_use_start(num_trees_, 0); + + const int num_threads = OMP_NUM_THREADS(); + + std::mt19937 random_values_use_start_eng = std::mt19937(random_seed_); + std::uniform_int_distribution random_values_use_start_dist = std::uniform_int_distribution(0, num_data); + for (int tree_index = 0; tree_index < num_trees_; ++tree_index) { + random_values_use_start[tree_index] = random_values_use_start_dist(random_values_use_start_eng); + } + + int num_blocks = 0; + data_size_t block_size = 0; + Threading::BlockInfo(num_data, 512, &num_blocks, &block_size); + #pragma omp parallel for schedule(static, 1) num_threads(num_threads) + for (int thread_id = 0; thread_id < num_blocks; ++thread_id) { + const data_size_t start = thread_id * block_size; + const data_size_t end = std::min(start + block_size, num_data); + std::mt19937 gradient_random_values_eng(random_seed_ + thread_id); + std::uniform_real_distribution gradient_random_values_dist(0.0f, 1.0f); + std::mt19937 hessian_random_values_eng(random_seed_ + thread_id + num_threads); + std::uniform_real_distribution hessian_random_values_dist(0.0f, 1.0f); + for (data_size_t i = start; i < end; ++i) { + gradient_random_values[i] = gradient_random_values_dist(gradient_random_values_eng); + hessian_random_values[i] = 
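// Sketch (not from the PR): Init() above pre-generates the rounding noise on
// the host with one mt19937 per fixed-size block, seeded from random_seed_
// plus the block id, so the stream is reproducible and independent of how the
// OpenMP runtime schedules the blocks. The core idea:
#include <random>
#include <vector>

inline std::vector<float> PregenNoiseBlock(int seed, int block_id, int count) {
  std::mt19937 eng(seed + block_id);  // per-block engine => deterministic output
  std::uniform_real_distribution<float> dist(0.0f, 1.0f);
  std::vector<float> noise(count);
  for (auto& v : noise) {
    v = dist(eng);
  }
  return noise;
}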
hessian_random_values_dist(hessian_random_values_eng); + } + } + + CopyFromHostToCUDADevice(gradient_random_values_.RawData(), gradient_random_values.data(), gradient_random_values.size(), __FILE__, __LINE__); + CopyFromHostToCUDADevice(hessian_random_values_.RawData(), hessian_random_values.data(), hessian_random_values.size(), __FILE__, __LINE__); + CopyFromHostToCUDADevice(random_values_use_start_.RawData(), random_values_use_start.data(), random_values_use_start.size(), __FILE__, __LINE__); + iter_ = 0; + } + + protected: + mutable CUDAVector discretized_gradients_and_hessians_; + mutable CUDAVector grad_min_block_buffer_; + mutable CUDAVector grad_max_block_buffer_; + mutable CUDAVector hess_min_block_buffer_; + mutable CUDAVector hess_max_block_buffer_; + CUDAVector random_values_use_start_; + CUDAVector gradient_random_values_; + CUDAVector hessian_random_values_; + int num_reduce_blocks_; +}; + +} // namespace LightGBM + +#endif // USE_CUDA +#endif // LIGHTGBM_TREELEARNER_CUDA_CUDA_GRADIENT_DISCRETIZER_HPP_ diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cpp b/src/treelearner/cuda/cuda_histogram_constructor.cpp index 7e6be1c1069c..659db2aad24c 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.cpp @@ -20,7 +20,9 @@ CUDAHistogramConstructor::CUDAHistogramConstructor( const int min_data_in_leaf, const double min_sum_hessian_in_leaf, const int gpu_device_id, - const bool gpu_use_dp): + const bool gpu_use_dp, + const bool use_quantized_grad, + const int num_grad_quant_bins): num_data_(train_data->num_data()), num_features_(train_data->num_features()), num_leaves_(num_leaves), @@ -28,24 +30,14 @@ CUDAHistogramConstructor::CUDAHistogramConstructor( min_data_in_leaf_(min_data_in_leaf), min_sum_hessian_in_leaf_(min_sum_hessian_in_leaf), gpu_device_id_(gpu_device_id), - gpu_use_dp_(gpu_use_dp) { + gpu_use_dp_(gpu_use_dp), + use_quantized_grad_(use_quantized_grad), + num_grad_quant_bins_(num_grad_quant_bins) { InitFeatureMetaInfo(train_data, feature_hist_offsets); cuda_row_data_.reset(nullptr); - cuda_feature_num_bins_ = nullptr; - cuda_feature_hist_offsets_ = nullptr; - cuda_feature_most_freq_bins_ = nullptr; - cuda_hist_ = nullptr; - cuda_need_fix_histogram_features_ = nullptr; - cuda_need_fix_histogram_features_num_bin_aligned_ = nullptr; } CUDAHistogramConstructor::~CUDAHistogramConstructor() { - DeallocateCUDAMemory(&cuda_feature_num_bins_, __FILE__, __LINE__); - DeallocateCUDAMemory(&cuda_feature_hist_offsets_, __FILE__, __LINE__); - DeallocateCUDAMemory(&cuda_feature_most_freq_bins_, __FILE__, __LINE__); - DeallocateCUDAMemory(&cuda_hist_, __FILE__, __LINE__); - DeallocateCUDAMemory(&cuda_need_fix_histogram_features_, __FILE__, __LINE__); - DeallocateCUDAMemory(&cuda_need_fix_histogram_features_num_bin_aligned_, __FILE__, __LINE__); gpuAssert(cudaStreamDestroy(cuda_stream_), __FILE__, __LINE__); } @@ -84,54 +76,70 @@ void CUDAHistogramConstructor::InitFeatureMetaInfo(const Dataset* train_data, co void CUDAHistogramConstructor::BeforeTrain(const score_t* gradients, const score_t* hessians) { cuda_gradients_ = gradients; cuda_hessians_ = hessians; - SetCUDAMemory(cuda_hist_, 0, num_total_bin_ * 2 * num_leaves_, __FILE__, __LINE__); + cuda_hist_.SetValue(0); } void CUDAHistogramConstructor::Init(const Dataset* train_data, TrainingShareStates* share_state) { - AllocateCUDAMemory(&cuda_hist_, num_total_bin_ * 2 * num_leaves_, __FILE__, __LINE__); - SetCUDAMemory(cuda_hist_, 0, num_total_bin_ * 2 * num_leaves_, 
__FILE__, __LINE__); + cuda_hist_.Resize(static_cast(num_total_bin_ * 2 * num_leaves_)); + cuda_hist_.SetValue(0); - InitCUDAMemoryFromHostMemory(&cuda_feature_num_bins_, - feature_num_bins_.data(), feature_num_bins_.size(), __FILE__, __LINE__); - - InitCUDAMemoryFromHostMemory(&cuda_feature_hist_offsets_, - feature_hist_offsets_.data(), feature_hist_offsets_.size(), __FILE__, __LINE__); - - InitCUDAMemoryFromHostMemory(&cuda_feature_most_freq_bins_, - feature_most_freq_bins_.data(), feature_most_freq_bins_.size(), __FILE__, __LINE__); + cuda_feature_num_bins_.InitFromHostVector(feature_num_bins_); + cuda_feature_hist_offsets_.InitFromHostVector(feature_hist_offsets_); + cuda_feature_most_freq_bins_.InitFromHostVector(feature_most_freq_bins_); cuda_row_data_.reset(new CUDARowData(train_data, share_state, gpu_device_id_, gpu_use_dp_)); cuda_row_data_->Init(train_data, share_state); CUDASUCCESS_OR_FATAL(cudaStreamCreate(&cuda_stream_)); - InitCUDAMemoryFromHostMemory(&cuda_need_fix_histogram_features_, need_fix_histogram_features_.data(), need_fix_histogram_features_.size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemory(&cuda_need_fix_histogram_features_num_bin_aligned_, need_fix_histogram_features_num_bin_aligend_.data(), - need_fix_histogram_features_num_bin_aligend_.size(), __FILE__, __LINE__); + cuda_need_fix_histogram_features_.InitFromHostVector(need_fix_histogram_features_); + cuda_need_fix_histogram_features_num_bin_aligned_.InitFromHostVector(need_fix_histogram_features_num_bin_aligend_); if (cuda_row_data_->NumLargeBinPartition() > 0) { int grid_dim_x = 0, grid_dim_y = 0, block_dim_x = 0, block_dim_y = 0; CalcConstructHistogramKernelDim(&grid_dim_x, &grid_dim_y, &block_dim_x, &block_dim_y, num_data_); - const size_t buffer_size = static_cast(grid_dim_y) * static_cast(num_total_bin_) * 2; - AllocateCUDAMemory(&cuda_hist_buffer_, buffer_size, __FILE__, __LINE__); + const size_t buffer_size = static_cast(grid_dim_y) * static_cast(num_total_bin_); + if (!use_quantized_grad_) { + if (gpu_use_dp_) { + // need to double the size of histogram buffer in global memory when using double precision in histogram construction + cuda_hist_buffer_.Resize(buffer_size * 4); + } else { + cuda_hist_buffer_.Resize(buffer_size * 2); + } + } else { + // use only half the size of histogram buffer in global memory when quantized training since each gradient and hessian takes only 2 bytes + cuda_hist_buffer_.Resize(buffer_size); + } } + hist_buffer_for_num_bit_change_.Resize(num_total_bin_ * 2); } void CUDAHistogramConstructor::ConstructHistogramForLeaf( const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, - const CUDALeafSplitsStruct* cuda_larger_leaf_splits, + const CUDALeafSplitsStruct* /*cuda_larger_leaf_splits*/, const data_size_t num_data_in_smaller_leaf, const data_size_t num_data_in_larger_leaf, const double sum_hessians_in_smaller_leaf, - const double sum_hessians_in_larger_leaf) { + const double sum_hessians_in_larger_leaf, + const uint8_t num_bits_in_histogram_bins) { if ((num_data_in_smaller_leaf <= min_data_in_leaf_ || sum_hessians_in_smaller_leaf <= min_sum_hessian_in_leaf_) && (num_data_in_larger_leaf <= min_data_in_leaf_ || sum_hessians_in_larger_leaf <= min_sum_hessian_in_leaf_)) { return; } - LaunchConstructHistogramKernel(cuda_smaller_leaf_splits, num_data_in_smaller_leaf); + LaunchConstructHistogramKernel(cuda_smaller_leaf_splits, num_data_in_smaller_leaf, num_bits_in_histogram_bins); SynchronizeCUDADevice(__FILE__, __LINE__); +} + +void 
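// Sketch (not from the PR): the refactor above replaces paired
// AllocateCUDAMemory/DeallocateCUDAMemory calls with CUDAVector members whose
// destructors free the device buffer. A minimal owning wrapper in the same
// spirit (LightGBM's real CUDAVector has more; error handling elided):
#include <cuda_runtime.h>
#include <cstddef>

template <typename T>
class DeviceBufferSketch {
 public:
  ~DeviceBufferSketch() {
    if (data_ != nullptr) cudaFree(data_);
  }
  void Resize(size_t n) {  // note: does not preserve old contents
    if (data_ != nullptr) cudaFree(data_);
    cudaMalloc(reinterpret_cast<void**>(&data_), n * sizeof(T));
    size_ = n;
  }
  T* RawData() { return data_; }
  size_t size() const { return size_; }

 private:
  T* data_ = nullptr;
  size_t size_ = 0;
};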
CUDAHistogramConstructor::SubtractHistogramForLeaf( + const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, + const CUDALeafSplitsStruct* cuda_larger_leaf_splits, + const bool use_quantized_grad, + const uint8_t parent_num_bits_in_histogram_bins, + const uint8_t smaller_num_bits_in_histogram_bins, + const uint8_t larger_num_bits_in_histogram_bins) { global_timer.Start("CUDAHistogramConstructor::ConstructHistogramForLeaf::LaunchSubtractHistogramKernel"); - LaunchSubtractHistogramKernel(cuda_smaller_leaf_splits, cuda_larger_leaf_splits); + LaunchSubtractHistogramKernel(cuda_smaller_leaf_splits, cuda_larger_leaf_splits, use_quantized_grad, + parent_num_bits_in_histogram_bins, smaller_num_bits_in_histogram_bins, larger_num_bits_in_histogram_bins); global_timer.Stop("CUDAHistogramConstructor::ConstructHistogramForLeaf::LaunchSubtractHistogramKernel"); } @@ -152,33 +160,18 @@ void CUDAHistogramConstructor::ResetTrainingData(const Dataset* train_data, Trai num_data_ = train_data->num_data(); num_features_ = train_data->num_features(); InitFeatureMetaInfo(train_data, share_states->feature_hist_offsets()); - if (feature_num_bins_.size() > 0) { - DeallocateCUDAMemory(&cuda_feature_num_bins_, __FILE__, __LINE__); - DeallocateCUDAMemory(&cuda_feature_hist_offsets_, __FILE__, __LINE__); - DeallocateCUDAMemory(&cuda_feature_most_freq_bins_, __FILE__, __LINE__); - DeallocateCUDAMemory(&cuda_need_fix_histogram_features_, __FILE__, __LINE__); - DeallocateCUDAMemory(&cuda_need_fix_histogram_features_num_bin_aligned_, __FILE__, __LINE__); - DeallocateCUDAMemory(&cuda_hist_, __FILE__, __LINE__); - } - - AllocateCUDAMemory(&cuda_hist_, num_total_bin_ * 2 * num_leaves_, __FILE__, __LINE__); - SetCUDAMemory(cuda_hist_, 0, num_total_bin_ * 2 * num_leaves_, __FILE__, __LINE__); - - InitCUDAMemoryFromHostMemory(&cuda_feature_num_bins_, - feature_num_bins_.data(), feature_num_bins_.size(), __FILE__, __LINE__); - - InitCUDAMemoryFromHostMemory(&cuda_feature_hist_offsets_, - feature_hist_offsets_.data(), feature_hist_offsets_.size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemory(&cuda_feature_most_freq_bins_, - feature_most_freq_bins_.data(), feature_most_freq_bins_.size(), __FILE__, __LINE__); + cuda_hist_.Resize(static_cast(num_total_bin_ * 2 * num_leaves_)); + cuda_hist_.SetValue(0); + cuda_feature_num_bins_.InitFromHostVector(feature_num_bins_); + cuda_feature_hist_offsets_.InitFromHostVector(feature_hist_offsets_); + cuda_feature_most_freq_bins_.InitFromHostVector(feature_most_freq_bins_); cuda_row_data_.reset(new CUDARowData(train_data, share_states, gpu_device_id_, gpu_use_dp_)); cuda_row_data_->Init(train_data, share_states); - InitCUDAMemoryFromHostMemory(&cuda_need_fix_histogram_features_, need_fix_histogram_features_.data(), need_fix_histogram_features_.size(), __FILE__, __LINE__); - InitCUDAMemoryFromHostMemory(&cuda_need_fix_histogram_features_num_bin_aligned_, need_fix_histogram_features_num_bin_aligend_.data(), - need_fix_histogram_features_num_bin_aligend_.size(), __FILE__, __LINE__); + cuda_need_fix_histogram_features_.InitFromHostVector(need_fix_histogram_features_); + cuda_need_fix_histogram_features_num_bin_aligned_.InitFromHostVector(need_fix_histogram_features_num_bin_aligend_); } void CUDAHistogramConstructor::ResetConfig(const Config* config) { @@ -186,9 +179,8 @@ void CUDAHistogramConstructor::ResetConfig(const Config* config) { num_leaves_ = config->num_leaves; min_data_in_leaf_ = config->min_data_in_leaf; min_sum_hessian_in_leaf_ = config->min_sum_hessian_in_leaf; - 
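// Sketch (not from the PR): SubtractHistogramForLeaf builds on the classic
// histogram-subtraction trick: hist(larger child) = hist(parent) -
// hist(smaller child), so only the smaller child pays for a construction pass.
// With packed integer bins this stays one subtraction per bin, since the
// unsigned hessian halves never borrow (parent >= smaller bin-wise). A minimal
// device loop, assuming 64-bit packed bins:
__global__ void SubtractHistSketch(const long long* parent_hist,
                                   const long long* smaller_hist,
                                   long long* larger_hist, int num_bins) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < num_bins) {
    larger_hist[i] = parent_hist[i] - smaller_hist[i];
  }
}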
DeallocateCUDAMemory(&cuda_hist_, __FILE__, __LINE__); - AllocateCUDAMemory(&cuda_hist_, num_total_bin_ * 2 * num_leaves_, __FILE__, __LINE__); - SetCUDAMemory(cuda_hist_, 0, num_total_bin_ * 2 * num_leaves_, __FILE__, __LINE__); + cuda_hist_.Resize(static_cast(num_total_bin_ * 2 * num_leaves_)); + cuda_hist_.SetValue(0); } } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_histogram_constructor.cu b/src/treelearner/cuda/cuda_histogram_constructor.cu index c884383304a4..03d3b8979439 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.cu +++ b/src/treelearner/cuda/cuda_histogram_constructor.cu @@ -125,7 +125,7 @@ __global__ void CUDAConstructHistogramSparseKernel( } } -template +template __global__ void CUDAConstructHistogramDenseKernel_GlobalMemory( const CUDALeafSplitsStruct* smaller_leaf_splits, const score_t* cuda_gradients, @@ -135,7 +135,7 @@ __global__ void CUDAConstructHistogramDenseKernel_GlobalMemory( const uint32_t* column_hist_offsets_full, const int* feature_partition_column_index_offsets, const data_size_t num_data, - float* global_hist_buffer) { + HIST_TYPE* global_hist_buffer) { const int dim_y = static_cast(gridDim.y * blockDim.y); const data_size_t num_data_in_smaller_leaf = smaller_leaf_splits->num_data_in_leaf; const data_size_t num_data_per_thread = (num_data_in_smaller_leaf + dim_y - 1) / dim_y; @@ -150,7 +150,7 @@ __global__ void CUDAConstructHistogramDenseKernel_GlobalMemory( const uint32_t num_items_in_partition = (partition_hist_end - partition_hist_start) << 1; const unsigned int thread_idx = threadIdx.x + threadIdx.y * blockDim.x; const int num_total_bin = column_hist_offsets_full[gridDim.x]; - float* shared_hist = global_hist_buffer + (blockIdx.y * num_total_bin + partition_hist_start) * 2; + HIST_TYPE* shared_hist = global_hist_buffer + (blockIdx.y * num_total_bin + partition_hist_start) * 2; for (unsigned int i = thread_idx; i < num_items_in_partition; i += num_threads_per_block) { shared_hist[i] = 0.0f; } @@ -166,14 +166,14 @@ __global__ void CUDAConstructHistogramDenseKernel_GlobalMemory( data_size_t inner_data_index = static_cast(threadIdx_y); const int column_index = static_cast(threadIdx.x) + partition_column_start; if (threadIdx.x < static_cast(num_columns_in_partition)) { - float* shared_hist_ptr = shared_hist + (column_hist_offsets[column_index] << 1); + HIST_TYPE* shared_hist_ptr = shared_hist + (column_hist_offsets[column_index] << 1); for (data_size_t i = 0; i < num_iteration_this; ++i) { const data_size_t data_index = data_indices_ref_this_block[inner_data_index]; const score_t grad = cuda_gradients[data_index]; const score_t hess = cuda_hessians[data_index]; const uint32_t bin = static_cast(data_ptr[static_cast(data_index) * num_columns_in_partition + threadIdx.x]); const uint32_t pos = bin << 1; - float* pos_ptr = shared_hist_ptr + pos; + HIST_TYPE* pos_ptr = shared_hist_ptr + pos; atomicAdd_block(pos_ptr, grad); atomicAdd_block(pos_ptr + 1, hess); inner_data_index += blockDim.y; @@ -186,7 +186,7 @@ __global__ void CUDAConstructHistogramDenseKernel_GlobalMemory( } } -template +template __global__ void CUDAConstructHistogramSparseKernel_GlobalMemory( const CUDALeafSplitsStruct* smaller_leaf_splits, const score_t* cuda_gradients, @@ -196,7 +196,7 @@ __global__ void CUDAConstructHistogramSparseKernel_GlobalMemory( const DATA_PTR_TYPE* partition_ptr, const uint32_t* column_hist_offsets_full, const data_size_t num_data, - float* global_hist_buffer) { + HIST_TYPE* global_hist_buffer) { const int dim_y = static_cast(gridDim.y * 
blockDim.y); const data_size_t num_data_in_smaller_leaf = smaller_leaf_splits->num_data_in_leaf; const data_size_t num_data_per_thread = (num_data_in_smaller_leaf + dim_y - 1) / dim_y; @@ -209,7 +209,7 @@ __global__ void CUDAConstructHistogramSparseKernel_GlobalMemory( const uint32_t num_items_in_partition = (partition_hist_end - partition_hist_start) << 1; const unsigned int thread_idx = threadIdx.x + threadIdx.y * blockDim.x; const int num_total_bin = column_hist_offsets_full[gridDim.x]; - float* shared_hist = global_hist_buffer + (blockIdx.y * num_total_bin + partition_hist_start) * 2; + HIST_TYPE* shared_hist = global_hist_buffer + (blockIdx.y * num_total_bin + partition_hist_start) * 2; for (unsigned int i = thread_idx; i < num_items_in_partition; i += num_threads_per_block) { shared_hist[i] = 0.0f; } @@ -233,7 +233,7 @@ __global__ void CUDAConstructHistogramSparseKernel_GlobalMemory( const score_t hess = cuda_hessians[data_index]; const uint32_t bin = static_cast(data_ptr[row_start + threadIdx.x]); const uint32_t pos = bin << 1; - float* pos_ptr = shared_hist + pos; + HIST_TYPE* pos_ptr = shared_hist + pos; atomicAdd_block(pos_ptr, grad); atomicAdd_block(pos_ptr + 1, hess); } @@ -246,13 +246,278 @@ __global__ void CUDAConstructHistogramSparseKernel_GlobalMemory( } } +template +__global__ void CUDAConstructDiscretizedHistogramDenseKernel( + const CUDALeafSplitsStruct* smaller_leaf_splits, + const int32_t* cuda_gradients_and_hessians, + const BIN_TYPE* data, + const uint32_t* column_hist_offsets, + const uint32_t* column_hist_offsets_full, + const int* feature_partition_column_index_offsets, + const data_size_t num_data) { + const int dim_y = static_cast(gridDim.y * blockDim.y); + const data_size_t num_data_in_smaller_leaf = smaller_leaf_splits->num_data_in_leaf; + const data_size_t num_data_per_thread = (num_data_in_smaller_leaf + dim_y - 1) / dim_y; + const data_size_t* data_indices_ref = smaller_leaf_splits->data_indices_in_leaf; + __shared__ int16_t shared_hist[SHARED_HIST_SIZE]; + int32_t* shared_hist_packed = reinterpret_cast(shared_hist); + const unsigned int num_threads_per_block = blockDim.x * blockDim.y; + const int partition_column_start = feature_partition_column_index_offsets[blockIdx.x]; + const int partition_column_end = feature_partition_column_index_offsets[blockIdx.x + 1]; + const BIN_TYPE* data_ptr = data + partition_column_start * num_data; + const int num_columns_in_partition = partition_column_end - partition_column_start; + const uint32_t partition_hist_start = column_hist_offsets_full[blockIdx.x]; + const uint32_t partition_hist_end = column_hist_offsets_full[blockIdx.x + 1]; + const uint32_t num_items_in_partition = (partition_hist_end - partition_hist_start); + const unsigned int thread_idx = threadIdx.x + threadIdx.y * blockDim.x; + for (unsigned int i = thread_idx; i < num_items_in_partition; i += num_threads_per_block) { + shared_hist_packed[i] = 0; + } + __syncthreads(); + const unsigned int threadIdx_y = threadIdx.y; + const unsigned int blockIdx_y = blockIdx.y; + const data_size_t block_start = (blockIdx_y * blockDim.y) * num_data_per_thread; + const data_size_t* data_indices_ref_this_block = data_indices_ref + block_start; + data_size_t block_num_data = max(0, min(num_data_in_smaller_leaf - block_start, num_data_per_thread * static_cast(blockDim.y))); + const data_size_t num_iteration_total = (block_num_data + blockDim.y - 1) / blockDim.y; + const data_size_t remainder = block_num_data % blockDim.y; + const data_size_t num_iteration_this = remainder == 
0 ? num_iteration_total : num_iteration_total - static_cast(threadIdx_y >= remainder); + data_size_t inner_data_index = static_cast(threadIdx_y); + const int column_index = static_cast(threadIdx.x) + partition_column_start; + if (threadIdx.x < static_cast(num_columns_in_partition)) { + int32_t* shared_hist_ptr = shared_hist_packed + (column_hist_offsets[column_index]); + for (data_size_t i = 0; i < num_iteration_this; ++i) { + const data_size_t data_index = data_indices_ref_this_block[inner_data_index]; + const int32_t grad_and_hess = cuda_gradients_and_hessians[data_index]; + const uint32_t bin = static_cast(data_ptr[data_index * num_columns_in_partition + threadIdx.x]); + int32_t* pos_ptr = shared_hist_ptr + bin; + atomicAdd_block(pos_ptr, grad_and_hess); + inner_data_index += blockDim.y; + } + } + __syncthreads(); + if (USE_16BIT_HIST) { + int32_t* feature_histogram_ptr = reinterpret_cast(smaller_leaf_splits->hist_in_leaf) + partition_hist_start; + for (unsigned int i = thread_idx; i < num_items_in_partition; i += num_threads_per_block) { + const int32_t packed_grad_hess = shared_hist_packed[i]; + atomicAdd_system(feature_histogram_ptr + i, packed_grad_hess); + } + } else { + atomic_add_long_t* feature_histogram_ptr = reinterpret_cast(smaller_leaf_splits->hist_in_leaf) + partition_hist_start; + for (unsigned int i = thread_idx; i < num_items_in_partition; i += num_threads_per_block) { + const int32_t packed_grad_hess = shared_hist_packed[i]; + const int64_t packed_grad_hess_int64 = (static_cast(static_cast(packed_grad_hess >> 16)) << 32) | (static_cast(packed_grad_hess & 0x0000ffff)); + atomicAdd_system(feature_histogram_ptr + i, (atomic_add_long_t)(packed_grad_hess_int64)); + } + } +} + +template +__global__ void CUDAConstructDiscretizedHistogramSparseKernel( + const CUDALeafSplitsStruct* smaller_leaf_splits, + const int32_t* cuda_gradients_and_hessians, + const BIN_TYPE* data, + const DATA_PTR_TYPE* row_ptr, + const DATA_PTR_TYPE* partition_ptr, + const uint32_t* column_hist_offsets_full, + const data_size_t num_data) { + const int dim_y = static_cast(gridDim.y * blockDim.y); + const data_size_t num_data_in_smaller_leaf = smaller_leaf_splits->num_data_in_leaf; + const data_size_t num_data_per_thread = (num_data_in_smaller_leaf + dim_y - 1) / dim_y; + const data_size_t* data_indices_ref = smaller_leaf_splits->data_indices_in_leaf; + __shared__ int16_t shared_hist[SHARED_HIST_SIZE]; + int32_t* shared_hist_packed = reinterpret_cast(shared_hist); + const unsigned int num_threads_per_block = blockDim.x * blockDim.y; + const DATA_PTR_TYPE* block_row_ptr = row_ptr + blockIdx.x * (num_data + 1); + const BIN_TYPE* data_ptr = data + partition_ptr[blockIdx.x]; + const uint32_t partition_hist_start = column_hist_offsets_full[blockIdx.x]; + const uint32_t partition_hist_end = column_hist_offsets_full[blockIdx.x + 1]; + const uint32_t num_items_in_partition = (partition_hist_end - partition_hist_start); + const unsigned int thread_idx = threadIdx.x + threadIdx.y * blockDim.x; + for (unsigned int i = thread_idx; i < num_items_in_partition; i += num_threads_per_block) { + shared_hist_packed[i] = 0.0f; + } + __syncthreads(); + const unsigned int threadIdx_y = threadIdx.y; + const unsigned int blockIdx_y = blockIdx.y; + const data_size_t block_start = (blockIdx_y * blockDim.y) * num_data_per_thread; + const data_size_t* data_indices_ref_this_block = data_indices_ref + block_start; + data_size_t block_num_data = max(0, min(num_data_in_smaller_leaf - block_start, num_data_per_thread * 
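// Sketch (not from the PR): the discretized kernels above pack a signed 16-bit
// gradient and a 16-bit hessian into one int32 so both halves of a bin are
// updated with a single atomic. A carry from the hessian half into the
// gradient half cannot occur as long as the accumulated low half stays within
// 16 bits, which is what the num_bits_in_histogram_bins bookkeeping is for.
#include <cstdint>

inline int32_t PackGradHess16(int16_t grad, uint16_t hess) {
  // Go through unsigned arithmetic to avoid shifting a negative value.
  const uint32_t packed = (static_cast<uint32_t>(static_cast<uint16_t>(grad)) << 16) | hess;
  return static_cast<int32_t>(packed);
}
// The per-bin update in the kernels above is then simply:
//   atomicAdd_block(&shared_hist_packed[bin], packed_grad_hess);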
static_cast(blockDim.y))); + const data_size_t num_iteration_total = (block_num_data + blockDim.y - 1) / blockDim.y; + const data_size_t remainder = block_num_data % blockDim.y; + const data_size_t num_iteration_this = remainder == 0 ? num_iteration_total : num_iteration_total - static_cast(threadIdx_y >= remainder); + data_size_t inner_data_index = static_cast(threadIdx_y); + for (data_size_t i = 0; i < num_iteration_this; ++i) { + const data_size_t data_index = data_indices_ref_this_block[inner_data_index]; + const DATA_PTR_TYPE row_start = block_row_ptr[data_index]; + const DATA_PTR_TYPE row_end = block_row_ptr[data_index + 1]; + const DATA_PTR_TYPE row_size = row_end - row_start; + if (threadIdx.x < row_size) { + const int32_t grad_and_hess = cuda_gradients_and_hessians[data_index]; + const uint32_t bin = static_cast(data_ptr[row_start + threadIdx.x]); + int32_t* pos_ptr = shared_hist_packed + bin; + atomicAdd_block(pos_ptr, grad_and_hess); + } + inner_data_index += blockDim.y; + } + __syncthreads(); + if (USE_16BIT_HIST) { + int32_t* feature_histogram_ptr = reinterpret_cast(smaller_leaf_splits->hist_in_leaf) + partition_hist_start; + for (unsigned int i = thread_idx; i < num_items_in_partition; i += num_threads_per_block) { + const int32_t packed_grad_hess = shared_hist_packed[i]; + atomicAdd_system(feature_histogram_ptr + i, packed_grad_hess); + } + } else { + atomic_add_long_t* feature_histogram_ptr = reinterpret_cast(smaller_leaf_splits->hist_in_leaf) + partition_hist_start; + for (unsigned int i = thread_idx; i < num_items_in_partition; i += num_threads_per_block) { + const int32_t packed_grad_hess = shared_hist_packed[i]; + const int64_t packed_grad_hess_int64 = (static_cast(static_cast(packed_grad_hess >> 16)) << 32) | (static_cast(packed_grad_hess & 0x0000ffff)); + atomicAdd_system(feature_histogram_ptr + i, (atomic_add_long_t)(packed_grad_hess_int64)); + } + } +} + +template +__global__ void CUDAConstructDiscretizedHistogramDenseKernel_GlobalMemory( + const CUDALeafSplitsStruct* smaller_leaf_splits, + const int32_t* cuda_gradients_and_hessians, + const BIN_TYPE* data, + const uint32_t* column_hist_offsets, + const uint32_t* column_hist_offsets_full, + const int* feature_partition_column_index_offsets, + const data_size_t num_data, + int32_t* global_hist_buffer) { + const int dim_y = static_cast(gridDim.y * blockDim.y); + const data_size_t num_data_in_smaller_leaf = smaller_leaf_splits->num_data_in_leaf; + const data_size_t num_data_per_thread = (num_data_in_smaller_leaf + dim_y - 1) / dim_y; + const data_size_t* data_indices_ref = smaller_leaf_splits->data_indices_in_leaf; + const unsigned int num_threads_per_block = blockDim.x * blockDim.y; + const int partition_column_start = feature_partition_column_index_offsets[blockIdx.x]; + const int partition_column_end = feature_partition_column_index_offsets[blockIdx.x + 1]; + const BIN_TYPE* data_ptr = data + partition_column_start * num_data; + const int num_columns_in_partition = partition_column_end - partition_column_start; + const uint32_t partition_hist_start = column_hist_offsets_full[blockIdx.x]; + const uint32_t partition_hist_end = column_hist_offsets_full[blockIdx.x + 1]; + const uint32_t num_items_in_partition = (partition_hist_end - partition_hist_start); + const int num_total_bin = column_hist_offsets_full[gridDim.x]; + int32_t* shared_hist_packed = global_hist_buffer + (blockIdx.y * num_total_bin + partition_column_start); + const unsigned int thread_idx = threadIdx.x + threadIdx.y * blockDim.x; + for (unsigned int i 
= thread_idx; i < num_items_in_partition; i += num_threads_per_block) { + shared_hist_packed[i] = 0; + } + __syncthreads(); + const unsigned int threadIdx_y = threadIdx.y; + const unsigned int blockIdx_y = blockIdx.y; + const data_size_t block_start = (blockIdx_y * blockDim.y) * num_data_per_thread; + const data_size_t* data_indices_ref_this_block = data_indices_ref + block_start; + data_size_t block_num_data = max(0, min(num_data_in_smaller_leaf - block_start, num_data_per_thread * static_cast(blockDim.y))); + const data_size_t num_iteration_total = (block_num_data + blockDim.y - 1) / blockDim.y; + const data_size_t remainder = block_num_data % blockDim.y; + const data_size_t num_iteration_this = remainder == 0 ? num_iteration_total : num_iteration_total - static_cast(threadIdx_y >= remainder); + data_size_t inner_data_index = static_cast(threadIdx_y); + const int column_index = static_cast(threadIdx.x) + partition_column_start; + if (threadIdx.x < static_cast(num_columns_in_partition)) { + int32_t* shared_hist_ptr = shared_hist_packed + (column_hist_offsets[column_index]); + for (data_size_t i = 0; i < num_iteration_this; ++i) { + const data_size_t data_index = data_indices_ref_this_block[inner_data_index]; + const int32_t grad_and_hess = cuda_gradients_and_hessians[data_index]; + const uint32_t bin = static_cast(data_ptr[data_index * num_columns_in_partition + threadIdx.x]); + int32_t* pos_ptr = shared_hist_ptr + bin; + atomicAdd_block(pos_ptr, grad_and_hess); + inner_data_index += blockDim.y; + } + } + __syncthreads(); + if (USE_16BIT_HIST) { + int32_t* feature_histogram_ptr = reinterpret_cast(smaller_leaf_splits->hist_in_leaf) + partition_hist_start; + for (unsigned int i = thread_idx; i < num_items_in_partition; i += num_threads_per_block) { + const int32_t packed_grad_hess = shared_hist_packed[i]; + atomicAdd_system(feature_histogram_ptr + i, packed_grad_hess); + } + } else { + atomic_add_long_t* feature_histogram_ptr = reinterpret_cast(smaller_leaf_splits->hist_in_leaf) + partition_hist_start; + for (unsigned int i = thread_idx; i < num_items_in_partition; i += num_threads_per_block) { + const int32_t packed_grad_hess = shared_hist_packed[i]; + const int64_t packed_grad_hess_int64 = (static_cast(static_cast(packed_grad_hess >> 16)) << 32) | (static_cast(packed_grad_hess & 0x0000ffff)); + atomicAdd_system(feature_histogram_ptr + i, (atomic_add_long_t)(packed_grad_hess_int64)); + } + } +} + +template +__global__ void CUDAConstructDiscretizedHistogramSparseKernel_GlobalMemory( + const CUDALeafSplitsStruct* smaller_leaf_splits, + const int32_t* cuda_gradients_and_hessians, + const BIN_TYPE* data, + const DATA_PTR_TYPE* row_ptr, + const DATA_PTR_TYPE* partition_ptr, + const uint32_t* column_hist_offsets_full, + const data_size_t num_data, + int32_t* global_hist_buffer) { + const int dim_y = static_cast(gridDim.y * blockDim.y); + const data_size_t num_data_in_smaller_leaf = smaller_leaf_splits->num_data_in_leaf; + const data_size_t num_data_per_thread = (num_data_in_smaller_leaf + dim_y - 1) / dim_y; + const data_size_t* data_indices_ref = smaller_leaf_splits->data_indices_in_leaf; + const int num_total_bin = column_hist_offsets_full[gridDim.x]; + const unsigned int num_threads_per_block = blockDim.x * blockDim.y; + const DATA_PTR_TYPE* block_row_ptr = row_ptr + blockIdx.x * (num_data + 1); + const BIN_TYPE* data_ptr = data + partition_ptr[blockIdx.x]; + const uint32_t partition_hist_start = column_hist_offsets_full[blockIdx.x]; + const uint32_t partition_hist_end = 
column_hist_offsets_full[blockIdx.x + 1]; + const uint32_t num_items_in_partition = (partition_hist_end - partition_hist_start); + const unsigned int thread_idx = threadIdx.x + threadIdx.y * blockDim.x; + int32_t* shared_hist_packed = global_hist_buffer + (blockIdx.y * num_total_bin + partition_hist_start); + for (unsigned int i = thread_idx; i < num_items_in_partition; i += num_threads_per_block) { + shared_hist_packed[i] = 0; + } + __syncthreads(); + const unsigned int threadIdx_y = threadIdx.y; + const unsigned int blockIdx_y = blockIdx.y; + const data_size_t block_start = (blockIdx_y * blockDim.y) * num_data_per_thread; + const data_size_t* data_indices_ref_this_block = data_indices_ref + block_start; + data_size_t block_num_data = max(0, min(num_data_in_smaller_leaf - block_start, num_data_per_thread * static_cast<data_size_t>(blockDim.y))); + const data_size_t num_iteration_total = (block_num_data + blockDim.y - 1) / blockDim.y; + const data_size_t remainder = block_num_data % blockDim.y; + const data_size_t num_iteration_this = remainder == 0 ? num_iteration_total : num_iteration_total - static_cast<data_size_t>(threadIdx_y >= remainder); + data_size_t inner_data_index = static_cast<data_size_t>(threadIdx_y); + for (data_size_t i = 0; i < num_iteration_this; ++i) { + const data_size_t data_index = data_indices_ref_this_block[inner_data_index]; + const DATA_PTR_TYPE row_start = block_row_ptr[data_index]; + const DATA_PTR_TYPE row_end = block_row_ptr[data_index + 1]; + const DATA_PTR_TYPE row_size = row_end - row_start; + if (threadIdx.x < row_size) { + const int32_t grad_and_hess = cuda_gradients_and_hessians[data_index]; + const uint32_t bin = static_cast<uint32_t>(data_ptr[row_start + threadIdx.x]); + int32_t* pos_ptr = shared_hist_packed + bin; + atomicAdd_block(pos_ptr, grad_and_hess); + } + inner_data_index += blockDim.y; + } + __syncthreads(); + if (USE_16BIT_HIST) { + int32_t* feature_histogram_ptr = reinterpret_cast<int32_t*>(smaller_leaf_splits->hist_in_leaf) + partition_hist_start; + for (unsigned int i = thread_idx; i < num_items_in_partition; i += num_threads_per_block) { + const int32_t packed_grad_hess = shared_hist_packed[i]; + atomicAdd_system(feature_histogram_ptr + i, packed_grad_hess); + } + } else { + atomic_add_long_t* feature_histogram_ptr = reinterpret_cast<atomic_add_long_t*>(smaller_leaf_splits->hist_in_leaf) + partition_hist_start; + for (unsigned int i = thread_idx; i < num_items_in_partition; i += num_threads_per_block) { + const int32_t packed_grad_hess = shared_hist_packed[i]; + const int64_t packed_grad_hess_int64 = (static_cast<int64_t>(static_cast<int16_t>(packed_grad_hess >> 16)) << 32) | (static_cast<int64_t>(packed_grad_hess & 0x0000ffff)); + atomicAdd_system(feature_histogram_ptr + i, (atomic_add_long_t)(packed_grad_hess_int64)); + } + } +} + void CUDAHistogramConstructor::LaunchConstructHistogramKernel( const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, - const data_size_t num_data_in_smaller_leaf) { + const data_size_t num_data_in_smaller_leaf, + const uint8_t num_bits_in_histogram_bins) { if (cuda_row_data_->shared_hist_size() == DP_SHARED_HIST_SIZE && gpu_use_dp_) { - LaunchConstructHistogramKernelInner<double, DP_SHARED_HIST_SIZE>(cuda_smaller_leaf_splits, num_data_in_smaller_leaf); + LaunchConstructHistogramKernelInner<double, DP_SHARED_HIST_SIZE>(cuda_smaller_leaf_splits, num_data_in_smaller_leaf, num_bits_in_histogram_bins); } else if (cuda_row_data_->shared_hist_size() == SP_SHARED_HIST_SIZE && !gpu_use_dp_) { - LaunchConstructHistogramKernelInner<float, SP_SHARED_HIST_SIZE>(cuda_smaller_leaf_splits, num_data_in_smaller_leaf); + LaunchConstructHistogramKernelInner<float, SP_SHARED_HIST_SIZE>(cuda_smaller_leaf_splits, num_data_in_smaller_leaf, 
num_bits_in_histogram_bins); } else { Log::Fatal("Unknown shared histogram size %d", cuda_row_data_->shared_hist_size()); } @@ -261,13 +526,14 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernel( template <typename HIST_TYPE, size_t SHARED_HIST_SIZE> void CUDAHistogramConstructor::LaunchConstructHistogramKernelInner( const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, - const data_size_t num_data_in_smaller_leaf) { + const data_size_t num_data_in_smaller_leaf, + const uint8_t num_bits_in_histogram_bins) { if (cuda_row_data_->bit_type() == 8) { - LaunchConstructHistogramKernelInner0<HIST_TYPE, SHARED_HIST_SIZE, uint8_t>(cuda_smaller_leaf_splits, num_data_in_smaller_leaf); + LaunchConstructHistogramKernelInner0<HIST_TYPE, SHARED_HIST_SIZE, uint8_t>(cuda_smaller_leaf_splits, num_data_in_smaller_leaf, num_bits_in_histogram_bins); } else if (cuda_row_data_->bit_type() == 16) { - LaunchConstructHistogramKernelInner0<HIST_TYPE, SHARED_HIST_SIZE, uint16_t>(cuda_smaller_leaf_splits, num_data_in_smaller_leaf); + LaunchConstructHistogramKernelInner0<HIST_TYPE, SHARED_HIST_SIZE, uint16_t>(cuda_smaller_leaf_splits, num_data_in_smaller_leaf, num_bits_in_histogram_bins); } else if (cuda_row_data_->bit_type() == 32) { - LaunchConstructHistogramKernelInner0<HIST_TYPE, SHARED_HIST_SIZE, uint32_t>(cuda_smaller_leaf_splits, num_data_in_smaller_leaf); + LaunchConstructHistogramKernelInner0<HIST_TYPE, SHARED_HIST_SIZE, uint32_t>(cuda_smaller_leaf_splits, num_data_in_smaller_leaf, num_bits_in_histogram_bins); } else { Log::Fatal("Unknown bit_type = %d", cuda_row_data_->bit_type()); } @@ -276,16 +542,17 @@ template <typename HIST_TYPE, size_t SHARED_HIST_SIZE, typename BIN_TYPE> void CUDAHistogramConstructor::LaunchConstructHistogramKernelInner0( const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, - const data_size_t num_data_in_smaller_leaf) { + const data_size_t num_data_in_smaller_leaf, + const uint8_t num_bits_in_histogram_bins) { if (cuda_row_data_->row_ptr_bit_type() == 16) { - LaunchConstructHistogramKernelInner1<HIST_TYPE, SHARED_HIST_SIZE, BIN_TYPE, uint16_t>(cuda_smaller_leaf_splits, num_data_in_smaller_leaf); + LaunchConstructHistogramKernelInner1<HIST_TYPE, SHARED_HIST_SIZE, BIN_TYPE, uint16_t>(cuda_smaller_leaf_splits, num_data_in_smaller_leaf, num_bits_in_histogram_bins); } else if (cuda_row_data_->row_ptr_bit_type() == 32) { - LaunchConstructHistogramKernelInner1<HIST_TYPE, SHARED_HIST_SIZE, BIN_TYPE, uint32_t>(cuda_smaller_leaf_splits, num_data_in_smaller_leaf); + LaunchConstructHistogramKernelInner1<HIST_TYPE, SHARED_HIST_SIZE, BIN_TYPE, uint32_t>(cuda_smaller_leaf_splits, num_data_in_smaller_leaf, num_bits_in_histogram_bins); } else if (cuda_row_data_->row_ptr_bit_type() == 64) { - LaunchConstructHistogramKernelInner1<HIST_TYPE, SHARED_HIST_SIZE, BIN_TYPE, uint64_t>(cuda_smaller_leaf_splits, num_data_in_smaller_leaf); + LaunchConstructHistogramKernelInner1<HIST_TYPE, SHARED_HIST_SIZE, BIN_TYPE, uint64_t>(cuda_smaller_leaf_splits, num_data_in_smaller_leaf, num_bits_in_histogram_bins); } else { if (!cuda_row_data_->is_sparse()) { - LaunchConstructHistogramKernelInner1<HIST_TYPE, SHARED_HIST_SIZE, BIN_TYPE, uint16_t>(cuda_smaller_leaf_splits, num_data_in_smaller_leaf); + LaunchConstructHistogramKernelInner1<HIST_TYPE, SHARED_HIST_SIZE, BIN_TYPE, uint16_t>(cuda_smaller_leaf_splits, num_data_in_smaller_leaf, num_bits_in_histogram_bins); } else { Log::Fatal("Unknown row_ptr_bit_type = %d", cuda_row_data_->row_ptr_bit_type()); } @@ -295,18 +562,20 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernelInner0( template <typename HIST_TYPE, size_t SHARED_HIST_SIZE, typename BIN_TYPE, typename DATA_PTR_TYPE> void CUDAHistogramConstructor::LaunchConstructHistogramKernelInner1( const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, - const data_size_t num_data_in_smaller_leaf) { + const data_size_t num_data_in_smaller_leaf, + const uint8_t num_bits_in_histogram_bins) { if (cuda_row_data_->NumLargeBinPartition() == 0) { - LaunchConstructHistogramKernelInner2<HIST_TYPE, SHARED_HIST_SIZE, BIN_TYPE, DATA_PTR_TYPE, false>(cuda_smaller_leaf_splits, num_data_in_smaller_leaf); + LaunchConstructHistogramKernelInner2<HIST_TYPE, SHARED_HIST_SIZE, BIN_TYPE, DATA_PTR_TYPE, false>(cuda_smaller_leaf_splits, num_data_in_smaller_leaf, num_bits_in_histogram_bins); } else { - LaunchConstructHistogramKernelInner2<HIST_TYPE, SHARED_HIST_SIZE, BIN_TYPE, DATA_PTR_TYPE, true>(cuda_smaller_leaf_splits, 
num_data_in_smaller_leaf); + LaunchConstructHistogramKernelInner2(cuda_smaller_leaf_splits, num_data_in_smaller_leaf, num_bits_in_histogram_bins); } } template void CUDAHistogramConstructor::LaunchConstructHistogramKernelInner2( const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, - const data_size_t num_data_in_smaller_leaf) { + const data_size_t num_data_in_smaller_leaf, + const uint8_t num_bits_in_histogram_bins) { int grid_dim_x = 0; int grid_dim_y = 0; int block_dim_x = 0; @@ -314,47 +583,139 @@ void CUDAHistogramConstructor::LaunchConstructHistogramKernelInner2( CalcConstructHistogramKernelDim(&grid_dim_x, &grid_dim_y, &block_dim_x, &block_dim_y, num_data_in_smaller_leaf); dim3 grid_dim(grid_dim_x, grid_dim_y); dim3 block_dim(block_dim_x, block_dim_y); - if (!USE_GLOBAL_MEM_BUFFER) { - if (cuda_row_data_->is_sparse()) { - CUDAConstructHistogramSparseKernel<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->GetBin(), - cuda_row_data_->GetRowPtr(), - cuda_row_data_->GetPartitionPtr(), - cuda_row_data_->cuda_partition_hist_offsets(), - num_data_); + if (use_quantized_grad_) { + if (USE_GLOBAL_MEM_BUFFER) { + if (cuda_row_data_->is_sparse()) { + if (num_bits_in_histogram_bins <= 16) { + CUDAConstructDiscretizedHistogramSparseKernel_GlobalMemory<<>>( + cuda_smaller_leaf_splits, + reinterpret_cast(cuda_gradients_), + cuda_row_data_->GetBin(), + cuda_row_data_->GetRowPtr(), + cuda_row_data_->GetPartitionPtr(), + cuda_row_data_->cuda_partition_hist_offsets(), + num_data_, + reinterpret_cast(cuda_hist_buffer_.RawData())); + } else { + CUDAConstructDiscretizedHistogramSparseKernel_GlobalMemory<<>>( + cuda_smaller_leaf_splits, + reinterpret_cast(cuda_gradients_), + cuda_row_data_->GetBin(), + cuda_row_data_->GetRowPtr(), + cuda_row_data_->GetPartitionPtr(), + cuda_row_data_->cuda_partition_hist_offsets(), + num_data_, + reinterpret_cast(cuda_hist_buffer_.RawData())); + } + } else { + if (num_bits_in_histogram_bins <= 16) { + CUDAConstructDiscretizedHistogramDenseKernel_GlobalMemory<<>>( + cuda_smaller_leaf_splits, + reinterpret_cast(cuda_gradients_), + cuda_row_data_->GetBin(), + cuda_row_data_->cuda_column_hist_offsets(), + cuda_row_data_->cuda_partition_hist_offsets(), + cuda_row_data_->cuda_feature_partition_column_index_offsets(), + num_data_, + reinterpret_cast(cuda_hist_buffer_.RawData())); + } else { + CUDAConstructDiscretizedHistogramDenseKernel_GlobalMemory<<>>( + cuda_smaller_leaf_splits, + reinterpret_cast(cuda_gradients_), + cuda_row_data_->GetBin(), + cuda_row_data_->cuda_column_hist_offsets(), + cuda_row_data_->cuda_partition_hist_offsets(), + cuda_row_data_->cuda_feature_partition_column_index_offsets(), + num_data_, + reinterpret_cast(cuda_hist_buffer_.RawData())); + } + } } else { - CUDAConstructHistogramDenseKernel<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->GetBin(), - cuda_row_data_->cuda_column_hist_offsets(), - cuda_row_data_->cuda_partition_hist_offsets(), - cuda_row_data_->cuda_feature_partition_column_index_offsets(), - num_data_); + if (cuda_row_data_->is_sparse()) { + if (num_bits_in_histogram_bins <= 16) { + CUDAConstructDiscretizedHistogramSparseKernel<<>>( + cuda_smaller_leaf_splits, + reinterpret_cast(cuda_gradients_), + cuda_row_data_->GetBin(), + cuda_row_data_->GetRowPtr(), + cuda_row_data_->GetPartitionPtr(), + cuda_row_data_->cuda_partition_hist_offsets(), + num_data_); + } else { + CUDAConstructDiscretizedHistogramSparseKernel<<>>( + cuda_smaller_leaf_splits, + 
reinterpret_cast(cuda_gradients_), + cuda_row_data_->GetBin(), + cuda_row_data_->GetRowPtr(), + cuda_row_data_->GetPartitionPtr(), + cuda_row_data_->cuda_partition_hist_offsets(), + num_data_); + } + } else { + if (num_bits_in_histogram_bins <= 16) { + CUDAConstructDiscretizedHistogramDenseKernel<<>>( + cuda_smaller_leaf_splits, + reinterpret_cast(cuda_gradients_), + cuda_row_data_->GetBin(), + cuda_row_data_->cuda_column_hist_offsets(), + cuda_row_data_->cuda_partition_hist_offsets(), + cuda_row_data_->cuda_feature_partition_column_index_offsets(), + num_data_); + } else { + CUDAConstructDiscretizedHistogramDenseKernel<<>>( + cuda_smaller_leaf_splits, + reinterpret_cast(cuda_gradients_), + cuda_row_data_->GetBin(), + cuda_row_data_->cuda_column_hist_offsets(), + cuda_row_data_->cuda_partition_hist_offsets(), + cuda_row_data_->cuda_feature_partition_column_index_offsets(), + num_data_); + } + } } } else { - if (cuda_row_data_->is_sparse()) { - CUDAConstructHistogramSparseKernel_GlobalMemory<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->GetBin(), - cuda_row_data_->GetRowPtr(), - cuda_row_data_->GetPartitionPtr(), - cuda_row_data_->cuda_partition_hist_offsets(), - num_data_, - cuda_hist_buffer_); + if (!USE_GLOBAL_MEM_BUFFER) { + if (cuda_row_data_->is_sparse()) { + CUDAConstructHistogramSparseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data_->GetBin(), + cuda_row_data_->GetRowPtr(), + cuda_row_data_->GetPartitionPtr(), + cuda_row_data_->cuda_partition_hist_offsets(), + num_data_); + } else { + CUDAConstructHistogramDenseKernel<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data_->GetBin(), + cuda_row_data_->cuda_column_hist_offsets(), + cuda_row_data_->cuda_partition_hist_offsets(), + cuda_row_data_->cuda_feature_partition_column_index_offsets(), + num_data_); + } } else { - CUDAConstructHistogramDenseKernel_GlobalMemory<<>>( - cuda_smaller_leaf_splits, - cuda_gradients_, cuda_hessians_, - cuda_row_data_->GetBin(), - cuda_row_data_->cuda_column_hist_offsets(), - cuda_row_data_->cuda_partition_hist_offsets(), - cuda_row_data_->cuda_feature_partition_column_index_offsets(), - num_data_, - cuda_hist_buffer_); + if (cuda_row_data_->is_sparse()) { + CUDAConstructHistogramSparseKernel_GlobalMemory<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data_->GetBin(), + cuda_row_data_->GetRowPtr(), + cuda_row_data_->GetPartitionPtr(), + cuda_row_data_->cuda_partition_hist_offsets(), + num_data_, + reinterpret_cast(cuda_hist_buffer_.RawData())); + } else { + CUDAConstructHistogramDenseKernel_GlobalMemory<<>>( + cuda_smaller_leaf_splits, + cuda_gradients_, cuda_hessians_, + cuda_row_data_->GetBin(), + cuda_row_data_->cuda_column_hist_offsets(), + cuda_row_data_->cuda_partition_hist_offsets(), + cuda_row_data_->cuda_feature_partition_column_index_offsets(), + num_data_, + reinterpret_cast(cuda_hist_buffer_.RawData())); + } } } } @@ -403,28 +764,195 @@ __global__ void FixHistogramKernel( } } +template +__global__ void SubtractHistogramDiscretizedKernel( + const int num_total_bin, + const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, + const CUDALeafSplitsStruct* cuda_larger_leaf_splits, + hist_t* num_bit_change_buffer) { + const unsigned int global_thread_index = threadIdx.x + blockIdx.x * blockDim.x; + const int cuda_larger_leaf_index_ref = cuda_larger_leaf_splits->leaf_index; + if (cuda_larger_leaf_index_ref >= 0) { + if (PARENT_USE_16BIT_HIST) { + const 
int32_t* smaller_leaf_hist = reinterpret_cast(cuda_smaller_leaf_splits->hist_in_leaf); + int32_t* larger_leaf_hist = reinterpret_cast(cuda_larger_leaf_splits->hist_in_leaf); + if (global_thread_index < num_total_bin) { + larger_leaf_hist[global_thread_index] -= smaller_leaf_hist[global_thread_index]; + } + } else if (LARGER_USE_16BIT_HIST) { + int32_t* buffer = reinterpret_cast(num_bit_change_buffer); + const int32_t* smaller_leaf_hist = reinterpret_cast(cuda_smaller_leaf_splits->hist_in_leaf); + int64_t* larger_leaf_hist = reinterpret_cast(cuda_larger_leaf_splits->hist_in_leaf); + if (global_thread_index < num_total_bin) { + const int64_t parent_hist_item = larger_leaf_hist[global_thread_index]; + const int32_t smaller_hist_item = smaller_leaf_hist[global_thread_index]; + const int64_t smaller_hist_item_int64 = (static_cast(static_cast(smaller_hist_item >> 16)) << 32) | + static_cast(smaller_hist_item & 0x0000ffff); + const int64_t larger_hist_item = parent_hist_item - smaller_hist_item_int64; + buffer[global_thread_index] = static_cast(static_cast(larger_hist_item >> 32) << 16) | + static_cast(larger_hist_item & 0x000000000000ffff); + } + } else if (SMALLER_USE_16BIT_HIST) { + const int32_t* smaller_leaf_hist = reinterpret_cast(cuda_smaller_leaf_splits->hist_in_leaf); + int64_t* larger_leaf_hist = reinterpret_cast(cuda_larger_leaf_splits->hist_in_leaf); + if (global_thread_index < num_total_bin) { + const int64_t parent_hist_item = larger_leaf_hist[global_thread_index]; + const int32_t smaller_hist_item = smaller_leaf_hist[global_thread_index]; + const int64_t smaller_hist_item_int64 = (static_cast(static_cast(smaller_hist_item >> 16)) << 32) | + static_cast(smaller_hist_item & 0x0000ffff); + const int64_t larger_hist_item = parent_hist_item - smaller_hist_item_int64; + larger_leaf_hist[global_thread_index] = larger_hist_item; + } + } else { + const int64_t* smaller_leaf_hist = reinterpret_cast(cuda_smaller_leaf_splits->hist_in_leaf); + int64_t* larger_leaf_hist = reinterpret_cast(cuda_larger_leaf_splits->hist_in_leaf); + if (global_thread_index < num_total_bin) { + larger_leaf_hist[global_thread_index] -= smaller_leaf_hist[global_thread_index]; + } + } + } +} + +__global__ void CopyChangedNumBitHistogram( + const int num_total_bin, + const CUDALeafSplitsStruct* cuda_larger_leaf_splits, + hist_t* num_bit_change_buffer) { + int32_t* hist_dst = reinterpret_cast(cuda_larger_leaf_splits->hist_in_leaf); + const int32_t* hist_src = reinterpret_cast(num_bit_change_buffer); + const unsigned int global_thread_index = threadIdx.x + blockIdx.x * blockDim.x; + if (global_thread_index < static_cast(num_total_bin)) { + hist_dst[global_thread_index] = hist_src[global_thread_index]; + } +} + +template +__global__ void FixHistogramDiscretizedKernel( + const uint32_t* cuda_feature_num_bins, + const uint32_t* cuda_feature_hist_offsets, + const uint32_t* cuda_feature_most_freq_bins, + const int* cuda_need_fix_histogram_features, + const uint32_t* cuda_need_fix_histogram_features_num_bin_aligned, + const CUDALeafSplitsStruct* cuda_smaller_leaf_splits) { + __shared__ int64_t shared_mem_buffer[32]; + const unsigned int blockIdx_x = blockIdx.x; + const int feature_index = cuda_need_fix_histogram_features[blockIdx_x]; + const uint32_t num_bin_aligned = cuda_need_fix_histogram_features_num_bin_aligned[blockIdx_x]; + const uint32_t feature_hist_offset = cuda_feature_hist_offsets[feature_index]; + const uint32_t most_freq_bin = cuda_feature_most_freq_bins[feature_index]; + if (USE_16BIT_HIST) { + const int64_t 
leaf_sum_gradients_hessians_int64 = cuda_smaller_leaf_splits->sum_of_gradients_hessians; + const int32_t leaf_sum_gradients_hessians = + (static_cast(leaf_sum_gradients_hessians_int64 >> 32) << 16) | static_cast(leaf_sum_gradients_hessians_int64 & 0x000000000000ffff); + int32_t* feature_hist = reinterpret_cast(cuda_smaller_leaf_splits->hist_in_leaf) + feature_hist_offset; + const unsigned int threadIdx_x = threadIdx.x; + const uint32_t num_bin = cuda_feature_num_bins[feature_index]; + const int32_t bin_gradient_hessian = (threadIdx_x < num_bin && threadIdx_x != most_freq_bin) ? feature_hist[threadIdx_x] : 0; + const int32_t sum_gradient_hessian = ShuffleReduceSum( + bin_gradient_hessian, + reinterpret_cast(shared_mem_buffer), + num_bin_aligned); + if (threadIdx_x == 0) { + feature_hist[most_freq_bin] = leaf_sum_gradients_hessians - sum_gradient_hessian; + } + } else { + const int64_t leaf_sum_gradients_hessians = cuda_smaller_leaf_splits->sum_of_gradients_hessians; + int64_t* feature_hist = reinterpret_cast(cuda_smaller_leaf_splits->hist_in_leaf) + feature_hist_offset; + const unsigned int threadIdx_x = threadIdx.x; + const uint32_t num_bin = cuda_feature_num_bins[feature_index]; + const int64_t bin_gradient_hessian = (threadIdx_x < num_bin && threadIdx_x != most_freq_bin) ? feature_hist[threadIdx_x] : 0; + const int64_t sum_gradient_hessian = ShuffleReduceSum(bin_gradient_hessian, shared_mem_buffer, num_bin_aligned); + if (threadIdx_x == 0) { + feature_hist[most_freq_bin] = leaf_sum_gradients_hessians - sum_gradient_hessian; + } + } +} + void CUDAHistogramConstructor::LaunchSubtractHistogramKernel( const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, - const CUDALeafSplitsStruct* cuda_larger_leaf_splits) { - const int num_subtract_threads = 2 * num_total_bin_; - const int num_subtract_blocks = (num_subtract_threads + SUBTRACT_BLOCK_SIZE - 1) / SUBTRACT_BLOCK_SIZE; - global_timer.Start("CUDAHistogramConstructor::FixHistogramKernel"); - if (need_fix_histogram_features_.size() > 0) { - FixHistogramKernel<<>>( - cuda_feature_num_bins_, - cuda_feature_hist_offsets_, - cuda_feature_most_freq_bins_, - cuda_need_fix_histogram_features_, - cuda_need_fix_histogram_features_num_bin_aligned_, - cuda_smaller_leaf_splits); - } - global_timer.Stop("CUDAHistogramConstructor::FixHistogramKernel"); - global_timer.Start("CUDAHistogramConstructor::SubtractHistogramKernel"); - SubtractHistogramKernel<<>>( - num_total_bin_, - cuda_smaller_leaf_splits, - cuda_larger_leaf_splits); - global_timer.Stop("CUDAHistogramConstructor::SubtractHistogramKernel"); + const CUDALeafSplitsStruct* cuda_larger_leaf_splits, + const bool use_discretized_grad, + const uint8_t parent_num_bits_in_histogram_bins, + const uint8_t smaller_num_bits_in_histogram_bins, + const uint8_t larger_num_bits_in_histogram_bins) { + if (!use_discretized_grad) { + const int num_subtract_threads = 2 * num_total_bin_; + const int num_subtract_blocks = (num_subtract_threads + SUBTRACT_BLOCK_SIZE - 1) / SUBTRACT_BLOCK_SIZE; + global_timer.Start("CUDAHistogramConstructor::FixHistogramKernel"); + if (need_fix_histogram_features_.size() > 0) { + FixHistogramKernel<<>>( + cuda_feature_num_bins_.RawData(), + cuda_feature_hist_offsets_.RawData(), + cuda_feature_most_freq_bins_.RawData(), + cuda_need_fix_histogram_features_.RawData(), + cuda_need_fix_histogram_features_num_bin_aligned_.RawData(), + cuda_smaller_leaf_splits); + } + global_timer.Stop("CUDAHistogramConstructor::FixHistogramKernel"); + 
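The discretized histogram kernels above all rely on the same packed-counter scheme: a quantized (gradient, hessian) pair occupies a single int32_t bin counter, with the signed gradient in the high 16 bits and the non-negative hessian count in the low 16 bits, so one atomicAdd_block accumulates both statistics at once; when per-bin sums need more room, the recurring expression (static_cast<int64_t>(static_cast<int16_t>(packed_grad_hess >> 16)) << 32) | (packed_grad_hess & 0x0000ffff) widens a counter to 32 bits per component inside an int64_t. A minimal host-side sketch of the scheme follows; the names are illustrative and not part of the patch.

#include <cassert>
#include <cstdint>

// Pack a quantized gradient (signed) and hessian (non-negative) into one
// 32-bit counter: <int16 grad | uint16 hess>. Adding two packed counters adds
// both halves simultaneously, provided the hessian half does not overflow.
int32_t PackGradHess16(int16_t grad, uint16_t hess) {
  return (static_cast<int32_t>(grad) << 16) | static_cast<int32_t>(hess);
}

// Widen to 32 bits per component. The gradient half is signed, so it must be
// sign-extended (hence the static_cast<int16_t> before shifting); the hessian
// half is a count-like non-negative sum and is zero-extended.
int64_t WidenTo32BitPerComponent(int32_t packed) {
  return (static_cast<int64_t>(static_cast<int16_t>(packed >> 16)) << 32) |
         static_cast<int64_t>(packed & 0x0000ffff);
}

int main() {
  const int32_t packed = PackGradHess16(-3, 7);
  const int64_t wide = WidenTo32BitPerComponent(packed);
  assert(static_cast<int32_t>(wide >> 32) == -3);        // gradient survives widening
  assert(static_cast<int32_t>(wide & 0xffffffff) == 7);  // hessian survives widening
  return 0;
}

Overflow of the low half into the gradient bits is exactly what the per-leaf bit-width bookkeeping further below (GetHistBitsInLeaf / SetNumBitsInHistogramBin) appears designed to prevent: small leaves can accumulate safely in 16-bit halves, while larger leaves switch to the 32-bit-per-component layout.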
global_timer.Start("CUDAHistogramConstructor::SubtractHistogramKernel"); + SubtractHistogramKernel<<>>( + num_total_bin_, + cuda_smaller_leaf_splits, + cuda_larger_leaf_splits); + global_timer.Stop("CUDAHistogramConstructor::SubtractHistogramKernel"); + } else { + const int num_subtract_threads = num_total_bin_; + const int num_subtract_blocks = (num_subtract_threads + SUBTRACT_BLOCK_SIZE - 1) / SUBTRACT_BLOCK_SIZE; + global_timer.Start("CUDAHistogramConstructor::FixHistogramDiscretizedKernel"); + if (need_fix_histogram_features_.size() > 0) { + if (smaller_num_bits_in_histogram_bins <= 16) { + FixHistogramDiscretizedKernel<<>>( + cuda_feature_num_bins_.RawData(), + cuda_feature_hist_offsets_.RawData(), + cuda_feature_most_freq_bins_.RawData(), + cuda_need_fix_histogram_features_.RawData(), + cuda_need_fix_histogram_features_num_bin_aligned_.RawData(), + cuda_smaller_leaf_splits); + } else { + FixHistogramDiscretizedKernel<<>>( + cuda_feature_num_bins_.RawData(), + cuda_feature_hist_offsets_.RawData(), + cuda_feature_most_freq_bins_.RawData(), + cuda_need_fix_histogram_features_.RawData(), + cuda_need_fix_histogram_features_num_bin_aligned_.RawData(), + cuda_smaller_leaf_splits); + } + } + global_timer.Stop("CUDAHistogramConstructor::FixHistogramDiscretizedKernel"); + global_timer.Start("CUDAHistogramConstructor::SubtractHistogramDiscretizedKernel"); + if (parent_num_bits_in_histogram_bins <= 16) { + CHECK_LE(smaller_num_bits_in_histogram_bins, 16); + CHECK_LE(larger_num_bits_in_histogram_bins, 16); + SubtractHistogramDiscretizedKernel<<>>( + num_total_bin_, + cuda_smaller_leaf_splits, + cuda_larger_leaf_splits, + hist_buffer_for_num_bit_change_.RawData()); + } else if (larger_num_bits_in_histogram_bins <= 16) { + CHECK_LE(smaller_num_bits_in_histogram_bins, 16); + SubtractHistogramDiscretizedKernel<<>>( + num_total_bin_, + cuda_smaller_leaf_splits, + cuda_larger_leaf_splits, + hist_buffer_for_num_bit_change_.RawData()); + CopyChangedNumBitHistogram<<>>( + num_total_bin_, + cuda_larger_leaf_splits, + hist_buffer_for_num_bit_change_.RawData()); + } else if (smaller_num_bits_in_histogram_bins <= 16) { + SubtractHistogramDiscretizedKernel<<>>( + num_total_bin_, + cuda_smaller_leaf_splits, + cuda_larger_leaf_splits, + hist_buffer_for_num_bit_change_.RawData()); + } else { + SubtractHistogramDiscretizedKernel<<>>( + num_total_bin_, + cuda_smaller_leaf_splits, + cuda_larger_leaf_splits, + hist_buffer_for_num_bit_change_.RawData()); + } + global_timer.Stop("CUDAHistogramConstructor::SubtractHistogramDiscretizedKernel"); + } } } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_histogram_constructor.hpp b/src/treelearner/cuda/cuda_histogram_constructor.hpp index 7e600e7c01b4..ddc78cb17d90 100644 --- a/src/treelearner/cuda/cuda_histogram_constructor.hpp +++ b/src/treelearner/cuda/cuda_histogram_constructor.hpp @@ -9,6 +9,7 @@ #ifdef USE_CUDA #include +#include #include #include @@ -37,7 +38,9 @@ class CUDAHistogramConstructor { const int min_data_in_leaf, const double min_sum_hessian_in_leaf, const int gpu_device_id, - const bool gpu_use_dp); + const bool gpu_use_dp, + const bool use_discretized_grad, + const int grad_discretized_bins); ~CUDAHistogramConstructor(); @@ -49,7 +52,16 @@ class CUDAHistogramConstructor { const data_size_t num_data_in_smaller_leaf, const data_size_t num_data_in_larger_leaf, const double sum_hessians_in_smaller_leaf, - const double sum_hessians_in_larger_leaf); + const double sum_hessians_in_larger_leaf, + const uint8_t num_bits_in_histogram_bins); + + void 
SubtractHistogramForLeaf( + const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, + const CUDALeafSplitsStruct* cuda_larger_leaf_splits, + const bool use_discretized_grad, + const uint8_t parent_num_bits_in_histogram_bins, + const uint8_t smaller_num_bits_in_histogram_bins, + const uint8_t larger_num_bits_in_histogram_bins); void ResetTrainingData(const Dataset* train_data, TrainingShareStates* share_states); @@ -57,9 +69,9 @@ class CUDAHistogramConstructor { void BeforeTrain(const score_t* gradients, const score_t* hessians); - const hist_t* cuda_hist() const { return cuda_hist_; } + const hist_t* cuda_hist() const { return cuda_hist_.RawData(); } - hist_t* cuda_hist_pointer() { return cuda_hist_; } + hist_t* cuda_hist_pointer() { return cuda_hist_.RawData(); } private: void InitFeatureMetaInfo(const Dataset* train_data, const std::vector& feature_hist_offsets); @@ -74,30 +86,39 @@ class CUDAHistogramConstructor { template void LaunchConstructHistogramKernelInner( const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, - const data_size_t num_data_in_smaller_leaf); + const data_size_t num_data_in_smaller_leaf, + const uint8_t num_bits_in_histogram_bins); template void LaunchConstructHistogramKernelInner0( const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, - const data_size_t num_data_in_smaller_leaf); + const data_size_t num_data_in_smaller_leaf, + const uint8_t num_bits_in_histogram_bins); template void LaunchConstructHistogramKernelInner1( const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, - const data_size_t num_data_in_smaller_leaf); + const data_size_t num_data_in_smaller_leaf, + const uint8_t num_bits_in_histogram_bins); template void LaunchConstructHistogramKernelInner2( const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, - const data_size_t num_data_in_smaller_leaf); + const data_size_t num_data_in_smaller_leaf, + const uint8_t num_bits_in_histogram_bins); void LaunchConstructHistogramKernel( const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, - const data_size_t num_data_in_smaller_leaf); + const data_size_t num_data_in_smaller_leaf, + const uint8_t num_bits_in_histogram_bins); void LaunchSubtractHistogramKernel( const CUDALeafSplitsStruct* cuda_smaller_leaf_splits, - const CUDALeafSplitsStruct* cuda_larger_leaf_splits); + const CUDALeafSplitsStruct* cuda_larger_leaf_splits, + const bool use_discretized_grad, + const uint8_t parent_num_bits_in_histogram_bins, + const uint8_t smaller_num_bits_in_histogram_bins, + const uint8_t larger_num_bits_in_histogram_bins); // Host memory @@ -136,19 +157,21 @@ class CUDAHistogramConstructor { /*! \brief CUDA row wise data */ std::unique_ptr cuda_row_data_; /*! \brief number of bins per feature */ - uint32_t* cuda_feature_num_bins_; + CUDAVector cuda_feature_num_bins_; /*! \brief offsets in histogram of all features */ - uint32_t* cuda_feature_hist_offsets_; + CUDAVector cuda_feature_hist_offsets_; /*! \brief most frequent bins in each feature */ - uint32_t* cuda_feature_most_freq_bins_; + CUDAVector cuda_feature_most_freq_bins_; /*! \brief CUDA histograms */ - hist_t* cuda_hist_; + CUDAVector cuda_hist_; /*! \brief CUDA histograms buffer for each block */ - float* cuda_hist_buffer_; + CUDAVector cuda_hist_buffer_; /*! \brief indices of feature whose histograms need to be fixed */ - int* cuda_need_fix_histogram_features_; + CUDAVector cuda_need_fix_histogram_features_; /*! 
\brief aligned number of bins of the features whose histograms need to be fixed */ - uint32_t* cuda_need_fix_histogram_features_num_bin_aligned_; + CUDAVector cuda_need_fix_histogram_features_num_bin_aligned_; + /*! \brief histogram buffer used in histogram subtraction with different number of bits for histogram bins */ + CUDAVector hist_buffer_for_num_bit_change_; // CUDA memory, held by other object @@ -161,6 +184,10 @@ class CUDAHistogramConstructor { const int gpu_device_id_; /*! \brief use double precision histogram per block */ const bool gpu_use_dp_; + /*! \brief whether to use quantized gradients */ + const bool use_quantized_grad_; + /*! \brief the number of bins to quantized gradients */ + const int num_grad_quant_bins_; }; } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_leaf_splits.cpp b/src/treelearner/cuda/cuda_leaf_splits.cpp index 6aa020d9ea0d..57b5b777c142 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.cpp +++ b/src/treelearner/cuda/cuda_leaf_splits.cpp @@ -11,27 +11,22 @@ namespace LightGBM { CUDALeafSplits::CUDALeafSplits(const data_size_t num_data): -num_data_(num_data) { - cuda_struct_ = nullptr; - cuda_sum_of_gradients_buffer_ = nullptr; - cuda_sum_of_hessians_buffer_ = nullptr; -} +num_data_(num_data) {} -CUDALeafSplits::~CUDALeafSplits() { - DeallocateCUDAMemory(&cuda_struct_, __FILE__, __LINE__); - DeallocateCUDAMemory(&cuda_sum_of_gradients_buffer_, __FILE__, __LINE__); - DeallocateCUDAMemory(&cuda_sum_of_hessians_buffer_, __FILE__, __LINE__); -} +CUDALeafSplits::~CUDALeafSplits() {} -void CUDALeafSplits::Init() { +void CUDALeafSplits::Init(const bool use_quantized_grad) { num_blocks_init_from_gradients_ = (num_data_ + NUM_THRADS_PER_BLOCK_LEAF_SPLITS - 1) / NUM_THRADS_PER_BLOCK_LEAF_SPLITS; // allocate more memory for sum reduction in CUDA // only the first element records the final sum - AllocateCUDAMemory(&cuda_sum_of_gradients_buffer_, num_blocks_init_from_gradients_, __FILE__, __LINE__); - AllocateCUDAMemory(&cuda_sum_of_hessians_buffer_, num_blocks_init_from_gradients_, __FILE__, __LINE__); + cuda_sum_of_gradients_buffer_.Resize(static_cast(num_blocks_init_from_gradients_)); + cuda_sum_of_hessians_buffer_.Resize(static_cast(num_blocks_init_from_gradients_)); + if (use_quantized_grad) { + cuda_sum_of_gradients_hessians_buffer_.Resize(static_cast(num_blocks_init_from_gradients_)); + } - AllocateCUDAMemory(&cuda_struct_, 1, __FILE__, __LINE__); + cuda_struct_.Resize(1); } void CUDALeafSplits::InitValues() { @@ -46,24 +41,33 @@ void CUDALeafSplits::InitValues( const data_size_t num_used_indices, hist_t* cuda_hist_in_leaf, double* root_sum_hessians) { cuda_gradients_ = cuda_gradients; cuda_hessians_ = cuda_hessians; - SetCUDAMemory(cuda_sum_of_gradients_buffer_, 0, num_blocks_init_from_gradients_, __FILE__, __LINE__); - SetCUDAMemory(cuda_sum_of_hessians_buffer_, 0, num_blocks_init_from_gradients_, __FILE__, __LINE__); + cuda_sum_of_gradients_buffer_.SetValue(0); + cuda_sum_of_hessians_buffer_.SetValue(0); LaunchInitValuesKernal(lambda_l1, lambda_l2, cuda_bagging_data_indices, cuda_data_indices_in_leaf, num_used_indices, cuda_hist_in_leaf); - CopyFromCUDADeviceToHost(root_sum_hessians, cuda_sum_of_hessians_buffer_, 1, __FILE__, __LINE__); + CopyFromCUDADeviceToHost(root_sum_hessians, cuda_sum_of_hessians_buffer_.RawData(), 1, __FILE__, __LINE__); + SynchronizeCUDADevice(__FILE__, __LINE__); +} + +void CUDALeafSplits::InitValues( + const double lambda_l1, const double lambda_l2, + const int16_t* cuda_gradients_and_hessians, + const data_size_t* 
cuda_bagging_data_indices, + const data_size_t* cuda_data_indices_in_leaf, const data_size_t num_used_indices, + hist_t* cuda_hist_in_leaf, double* root_sum_hessians, + const score_t* grad_scale, const score_t* hess_scale) { + cuda_gradients_ = reinterpret_cast(cuda_gradients_and_hessians); + cuda_hessians_ = nullptr; + LaunchInitValuesKernal(lambda_l1, lambda_l2, cuda_bagging_data_indices, cuda_data_indices_in_leaf, num_used_indices, cuda_hist_in_leaf, grad_scale, hess_scale); + CopyFromCUDADeviceToHost(root_sum_hessians, cuda_sum_of_hessians_buffer_.RawData(), 1, __FILE__, __LINE__); SynchronizeCUDADevice(__FILE__, __LINE__); } void CUDALeafSplits::Resize(const data_size_t num_data) { - if (num_data > num_data_) { - DeallocateCUDAMemory(&cuda_sum_of_gradients_buffer_, __FILE__, __LINE__); - DeallocateCUDAMemory(&cuda_sum_of_hessians_buffer_, __FILE__, __LINE__); - num_blocks_init_from_gradients_ = (num_data + NUM_THRADS_PER_BLOCK_LEAF_SPLITS - 1) / NUM_THRADS_PER_BLOCK_LEAF_SPLITS; - AllocateCUDAMemory(&cuda_sum_of_gradients_buffer_, num_blocks_init_from_gradients_, __FILE__, __LINE__); - AllocateCUDAMemory(&cuda_sum_of_hessians_buffer_, num_blocks_init_from_gradients_, __FILE__, __LINE__); - } else { - num_blocks_init_from_gradients_ = (num_data + NUM_THRADS_PER_BLOCK_LEAF_SPLITS - 1) / NUM_THRADS_PER_BLOCK_LEAF_SPLITS; - } num_data_ = num_data; + num_blocks_init_from_gradients_ = (num_data + NUM_THRADS_PER_BLOCK_LEAF_SPLITS - 1) / NUM_THRADS_PER_BLOCK_LEAF_SPLITS; + cuda_sum_of_gradients_buffer_.Resize(static_cast(num_blocks_init_from_gradients_)); + cuda_sum_of_hessians_buffer_.Resize(static_cast(num_blocks_init_from_gradients_)); + cuda_sum_of_gradients_hessians_buffer_.Resize(static_cast(num_blocks_init_from_gradients_)); } } // namespace LightGBM diff --git a/src/treelearner/cuda/cuda_leaf_splits.cu b/src/treelearner/cuda/cuda_leaf_splits.cu index 29e42f67ead9..ae505ecd55dd 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.cu +++ b/src/treelearner/cuda/cuda_leaf_splits.cu @@ -81,6 +81,90 @@ __global__ void CUDAInitValuesKernel2( } } +template +__global__ void CUDAInitValuesKernel3(const int16_t* cuda_gradients_and_hessians, + const data_size_t num_data, const data_size_t* cuda_bagging_data_indices, + double* cuda_sum_of_gradients, double* cuda_sum_of_hessians, int64_t* cuda_sum_of_hessians_hessians, + const score_t* grad_scale_pointer, const score_t* hess_scale_pointer) { + const score_t grad_scale = *grad_scale_pointer; + const score_t hess_scale = *hess_scale_pointer; + __shared__ int64_t shared_mem_buffer[32]; + const data_size_t data_index = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + int64_t int_gradient = 0; + int64_t int_hessian = 0; + if (data_index < num_data) { + int_gradient = USE_INDICES ? cuda_gradients_and_hessians[2 * cuda_bagging_data_indices[data_index] + 1] : + cuda_gradients_and_hessians[2 * data_index + 1]; + int_hessian = USE_INDICES ? 
cuda_gradients_and_hessians[2 * cuda_bagging_data_indices[data_index]] : + cuda_gradients_and_hessians[2 * data_index]; + } + const int64_t block_sum_gradient = ShuffleReduceSum(int_gradient, shared_mem_buffer, blockDim.x); + __syncthreads(); + const int64_t block_sum_hessian = ShuffleReduceSum(int_hessian, shared_mem_buffer, blockDim.x); + if (threadIdx.x == 0) { + cuda_sum_of_gradients[blockIdx.x] = block_sum_gradient * grad_scale; + cuda_sum_of_hessians[blockIdx.x] = block_sum_hessian * hess_scale; + cuda_sum_of_hessians_hessians[blockIdx.x] = ((block_sum_gradient << 32) | block_sum_hessian); + } +} + +__global__ void CUDAInitValuesKernel4( + const double lambda_l1, + const double lambda_l2, + const int num_blocks_to_reduce, + double* cuda_sum_of_gradients, + double* cuda_sum_of_hessians, + int64_t* cuda_sum_of_gradients_hessians, + const data_size_t num_data, + const data_size_t* cuda_data_indices_in_leaf, + hist_t* cuda_hist_in_leaf, + CUDALeafSplitsStruct* cuda_struct) { + __shared__ double shared_mem_buffer[32]; + double thread_sum_of_gradients = 0.0f; + double thread_sum_of_hessians = 0.0f; + int64_t thread_sum_of_gradients_hessians = 0; + for (int block_index = static_cast<int>(threadIdx.x); block_index < num_blocks_to_reduce; block_index += static_cast<int>(blockDim.x)) { + thread_sum_of_gradients += cuda_sum_of_gradients[block_index]; + thread_sum_of_hessians += cuda_sum_of_hessians[block_index]; + thread_sum_of_gradients_hessians += cuda_sum_of_gradients_hessians[block_index]; + } + const double sum_of_gradients = ShuffleReduceSum(thread_sum_of_gradients, shared_mem_buffer, blockDim.x); + __syncthreads(); + const double sum_of_hessians = ShuffleReduceSum(thread_sum_of_hessians, shared_mem_buffer, blockDim.x); + __syncthreads(); + const int64_t sum_of_gradients_hessians = ShuffleReduceSum( + thread_sum_of_gradients_hessians, + reinterpret_cast<int64_t*>(shared_mem_buffer), + blockDim.x); + if (threadIdx.x == 0) { + cuda_sum_of_hessians[0] = sum_of_hessians; + cuda_struct->leaf_index = 0; + cuda_struct->sum_of_gradients = sum_of_gradients; + cuda_struct->sum_of_hessians = sum_of_hessians; + cuda_struct->sum_of_gradients_hessians = sum_of_gradients_hessians; + cuda_struct->num_data_in_leaf = num_data; + const bool use_l1 = lambda_l1 > 0.0f; + if (!use_l1) { + // no smoothing on root node + cuda_struct->gain = CUDALeafSplits::GetLeafGain<false, false>(sum_of_gradients, sum_of_hessians, lambda_l1, lambda_l2, 0.0f, 0, 0.0f); + } else { + // no smoothing on root node + cuda_struct->gain = CUDALeafSplits::GetLeafGain<true, false>(sum_of_gradients, sum_of_hessians, lambda_l1, lambda_l2, 0.0f, 0, 0.0f); + } + if (!use_l1) { + // no smoothing on root node + cuda_struct->leaf_value = + CUDALeafSplits::CalculateSplittedLeafOutput<false, false>(sum_of_gradients, sum_of_hessians, lambda_l1, lambda_l2, 0.0f, 0, 0.0f); + } else { + // no smoothing on root node + cuda_struct->leaf_value = + CUDALeafSplits::CalculateSplittedLeafOutput<true, false>(sum_of_gradients, sum_of_hessians, lambda_l1, lambda_l2, 0.0f, 0, 0.0f); + } + cuda_struct->data_indices_in_leaf = cuda_data_indices_in_leaf; + cuda_struct->hist_in_leaf = cuda_hist_in_leaf; + } +} + __global__ void InitValuesEmptyKernel(CUDALeafSplitsStruct* cuda_struct) { cuda_struct->leaf_index = -1; cuda_struct->sum_of_gradients = 0.0f; @@ -93,7 +177,7 @@ __global__ void InitValuesEmptyKernel(CUDALeafSplitsStruct* cuda_struct) { } void CUDALeafSplits::LaunchInitValuesEmptyKernel() { - InitValuesEmptyKernel<<<1, 1>>>(cuda_struct_); + InitValuesEmptyKernel<<<1, 1>>>(cuda_struct_.RawData()); } void 
CUDALeafSplits::LaunchInitValuesKernal( @@ -104,23 +188,55 @@ void CUDALeafSplits::LaunchInitValuesKernal( hist_t* cuda_hist_in_leaf) { if (cuda_bagging_data_indices == nullptr) { CUDAInitValuesKernel1<<>>( - cuda_gradients_, cuda_hessians_, num_used_indices, nullptr, cuda_sum_of_gradients_buffer_, - cuda_sum_of_hessians_buffer_); + cuda_gradients_, cuda_hessians_, num_used_indices, nullptr, cuda_sum_of_gradients_buffer_.RawData(), + cuda_sum_of_hessians_buffer_.RawData()); } else { CUDAInitValuesKernel1<<>>( - cuda_gradients_, cuda_hessians_, num_used_indices, cuda_bagging_data_indices, cuda_sum_of_gradients_buffer_, - cuda_sum_of_hessians_buffer_); + cuda_gradients_, cuda_hessians_, num_used_indices, cuda_bagging_data_indices, cuda_sum_of_gradients_buffer_.RawData(), + cuda_sum_of_hessians_buffer_.RawData()); } SynchronizeCUDADevice(__FILE__, __LINE__); CUDAInitValuesKernel2<<<1, NUM_THRADS_PER_BLOCK_LEAF_SPLITS>>>( lambda_l1, lambda_l2, num_blocks_init_from_gradients_, - cuda_sum_of_gradients_buffer_, - cuda_sum_of_hessians_buffer_, + cuda_sum_of_gradients_buffer_.RawData(), + cuda_sum_of_hessians_buffer_.RawData(), + num_used_indices, + cuda_data_indices_in_leaf, + cuda_hist_in_leaf, + cuda_struct_.RawData()); + SynchronizeCUDADevice(__FILE__, __LINE__); +} + +void CUDALeafSplits::LaunchInitValuesKernal( + const double lambda_l1, const double lambda_l2, + const data_size_t* cuda_bagging_data_indices, + const data_size_t* cuda_data_indices_in_leaf, + const data_size_t num_used_indices, + hist_t* cuda_hist_in_leaf, + const score_t* grad_scale, + const score_t* hess_scale) { + if (cuda_bagging_data_indices == nullptr) { + CUDAInitValuesKernel3<<>>( + reinterpret_cast(cuda_gradients_), num_used_indices, nullptr, cuda_sum_of_gradients_buffer_.RawData(), + cuda_sum_of_hessians_buffer_.RawData(), cuda_sum_of_gradients_hessians_buffer_.RawData(), grad_scale, hess_scale); + } else { + CUDAInitValuesKernel3<<>>( + reinterpret_cast(cuda_gradients_), num_used_indices, cuda_bagging_data_indices, cuda_sum_of_gradients_buffer_.RawData(), + cuda_sum_of_hessians_buffer_.RawData(), cuda_sum_of_gradients_hessians_buffer_.RawData(), grad_scale, hess_scale); + } + + SynchronizeCUDADevice(__FILE__, __LINE__); + CUDAInitValuesKernel4<<<1, NUM_THRADS_PER_BLOCK_LEAF_SPLITS>>>( + lambda_l1, lambda_l2, + num_blocks_init_from_gradients_, + cuda_sum_of_gradients_buffer_.RawData(), + cuda_sum_of_hessians_buffer_.RawData(), + cuda_sum_of_gradients_hessians_buffer_.RawData(), num_used_indices, cuda_data_indices_in_leaf, cuda_hist_in_leaf, - cuda_struct_); + cuda_struct_.RawData()); SynchronizeCUDADevice(__FILE__, __LINE__); } diff --git a/src/treelearner/cuda/cuda_leaf_splits.hpp b/src/treelearner/cuda/cuda_leaf_splits.hpp index 769f956b95c3..33a9ea578a1f 100644 --- a/src/treelearner/cuda/cuda_leaf_splits.hpp +++ b/src/treelearner/cuda/cuda_leaf_splits.hpp @@ -8,7 +8,7 @@ #ifdef USE_CUDA -#include +#include #include #include #include @@ -23,6 +23,7 @@ struct CUDALeafSplitsStruct { int leaf_index; double sum_of_gradients; double sum_of_hessians; + int64_t sum_of_gradients_hessians; data_size_t num_data_in_leaf; double gain; double leaf_value; @@ -36,7 +37,7 @@ class CUDALeafSplits { ~CUDALeafSplits(); - void Init(); + void Init(const bool use_quantized_grad); void InitValues( const double lambda_l1, const double lambda_l2, @@ -45,11 +46,19 @@ class CUDALeafSplits { const data_size_t* cuda_data_indices_in_leaf, const data_size_t num_used_indices, hist_t* cuda_hist_in_leaf, double* root_sum_hessians); + void InitValues( 
+ const double lambda_l1, const double lambda_l2, + const int16_t* cuda_gradients_and_hessians, + const data_size_t* cuda_bagging_data_indices, + const data_size_t* cuda_data_indices_in_leaf, const data_size_t num_used_indices, + hist_t* cuda_hist_in_leaf, double* root_sum_hessians, + const score_t* grad_scale, const score_t* hess_scale); + void InitValues(); - const CUDALeafSplitsStruct* GetCUDAStruct() const { return cuda_struct_; } + const CUDALeafSplitsStruct* GetCUDAStruct() const { return cuda_struct_.RawDataReadOnly(); } - CUDALeafSplitsStruct* GetCUDAStructRef() { return cuda_struct_; } + CUDALeafSplitsStruct* GetCUDAStructRef() { return cuda_struct_.RawData(); } void Resize(const data_size_t num_data); @@ -140,14 +149,24 @@ class CUDALeafSplits { const data_size_t num_used_indices, hist_t* cuda_hist_in_leaf); + void LaunchInitValuesKernal( + const double lambda_l1, const double lambda_l2, + const data_size_t* cuda_bagging_data_indices, + const data_size_t* cuda_data_indices_in_leaf, + const data_size_t num_used_indices, + hist_t* cuda_hist_in_leaf, + const score_t* grad_scale, + const score_t* hess_scale); + // Host memory data_size_t num_data_; int num_blocks_init_from_gradients_; // CUDA memory, held by this object - CUDALeafSplitsStruct* cuda_struct_; - double* cuda_sum_of_gradients_buffer_; - double* cuda_sum_of_hessians_buffer_; + CUDAVector cuda_struct_; + CUDAVector cuda_sum_of_gradients_buffer_; + CUDAVector cuda_sum_of_hessians_buffer_; + CUDAVector cuda_sum_of_gradients_hessians_buffer_; // CUDA memory, held by other object const score_t* cuda_gradients_; diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp index 1600f3767c0c..8f8ff15f0715 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cpp @@ -9,7 +9,7 @@ #include "cuda_single_gpu_tree_learner.hpp" #include -#include +#include #include #include #include @@ -39,13 +39,14 @@ void CUDASingleGPUTreeLearner::Init(const Dataset* train_data, bool is_constant_ SetCUDADevice(gpu_device_id_, __FILE__, __LINE__); cuda_smaller_leaf_splits_.reset(new CUDALeafSplits(num_data_)); - cuda_smaller_leaf_splits_->Init(); + cuda_smaller_leaf_splits_->Init(config_->use_quantized_grad); cuda_larger_leaf_splits_.reset(new CUDALeafSplits(num_data_)); - cuda_larger_leaf_splits_->Init(); + cuda_larger_leaf_splits_->Init(config_->use_quantized_grad); cuda_histogram_constructor_.reset(new CUDAHistogramConstructor(train_data_, config_->num_leaves, num_threads_, share_state_->feature_hist_offsets(), - config_->min_data_in_leaf, config_->min_sum_hessian_in_leaf, gpu_device_id_, config_->gpu_use_dp)); + config_->min_data_in_leaf, config_->min_sum_hessian_in_leaf, gpu_device_id_, config_->gpu_use_dp, + config_->use_quantized_grad, config_->num_grad_quant_bins)); cuda_histogram_constructor_->Init(train_data_, share_state_.get()); const auto& feature_hist_offsets = share_state_->feature_hist_offsets(); @@ -73,11 +74,19 @@ void CUDASingleGPUTreeLearner::Init(const Dataset* train_data, bool is_constant_ } AllocateBitset(); - cuda_leaf_gradient_stat_buffer_ = nullptr; - cuda_leaf_hessian_stat_buffer_ = nullptr; leaf_stat_buffer_size_ = 0; num_cat_threshold_ = 0; + if (config_->use_quantized_grad) { + cuda_leaf_gradient_stat_buffer_.Resize(config_->num_leaves); + cuda_leaf_hessian_stat_buffer_.Resize(config_->num_leaves); + cuda_gradient_discretizer_.reset(new CUDAGradientDiscretizer( + config_->num_grad_quant_bins, 
config_->num_iterations, config_->seed, is_constant_hessian, config_->stochastic_rounding)); + cuda_gradient_discretizer_->Init(num_data_, config_->num_leaves, train_data_->num_features(), train_data_); + } else { + cuda_gradient_discretizer_.reset(nullptr); + } + #ifdef DEBUG host_gradients_.resize(num_data_, 0.0f); host_hessians_.resize(num_data_, 0.0f); @@ -101,19 +110,37 @@ void CUDASingleGPUTreeLearner::BeforeTrain() { const data_size_t* leaf_splits_init_indices = cuda_data_partition_->use_bagging() ? cuda_data_partition_->cuda_data_indices() : nullptr; cuda_data_partition_->BeforeTrain(); - cuda_smaller_leaf_splits_->InitValues( - config_->lambda_l1, - config_->lambda_l2, - gradients_, - hessians_, - leaf_splits_init_indices, - cuda_data_partition_->cuda_data_indices(), - root_num_data, - cuda_histogram_constructor_->cuda_hist_pointer(), - &leaf_sum_hessians_[0]); + if (config_->use_quantized_grad) { + cuda_gradient_discretizer_->DiscretizeGradients(num_data_, gradients_, hessians_); + cuda_histogram_constructor_->BeforeTrain( + reinterpret_cast(cuda_gradient_discretizer_->discretized_gradients_and_hessians()), nullptr); + cuda_smaller_leaf_splits_->InitValues( + config_->lambda_l1, + config_->lambda_l2, + reinterpret_cast(cuda_gradient_discretizer_->discretized_gradients_and_hessians()), + leaf_splits_init_indices, + cuda_data_partition_->cuda_data_indices(), + root_num_data, + cuda_histogram_constructor_->cuda_hist_pointer(), + &leaf_sum_hessians_[0], + cuda_gradient_discretizer_->grad_scale_ptr(), + cuda_gradient_discretizer_->hess_scale_ptr()); + cuda_gradient_discretizer_->SetNumBitsInHistogramBin(0, -1, root_num_data, 0); + } else { + cuda_histogram_constructor_->BeforeTrain(gradients_, hessians_); + cuda_smaller_leaf_splits_->InitValues( + config_->lambda_l1, + config_->lambda_l2, + gradients_, + hessians_, + leaf_splits_init_indices, + cuda_data_partition_->cuda_data_indices(), + root_num_data, + cuda_histogram_constructor_->cuda_hist_pointer(), + &leaf_sum_hessians_[0]); + } leaf_num_data_[0] = root_num_data; cuda_larger_leaf_splits_->InitValues(); - cuda_histogram_constructor_->BeforeTrain(gradients_, hessians_); col_sampler_.ResetByTree(); cuda_best_split_finder_->BeforeTrain(col_sampler_.is_feature_used_bytree()); leaf_data_start_[0] = 0; @@ -141,24 +168,70 @@ Tree* CUDASingleGPUTreeLearner::Train(const score_t* gradients, const data_size_t num_data_in_larger_leaf = larger_leaf_index_ < 0 ? 0 : leaf_num_data_[larger_leaf_index_]; const double sum_hessians_in_smaller_leaf = leaf_sum_hessians_[smaller_leaf_index_]; const double sum_hessians_in_larger_leaf = larger_leaf_index_ < 0 ? 0 : leaf_sum_hessians_[larger_leaf_index_]; + const uint8_t num_bits_in_histogram_bins = config_->use_quantized_grad ? 
cuda_gradient_discretizer_->GetHistBitsInLeaf(smaller_leaf_index_) : 0; cuda_histogram_constructor_->ConstructHistogramForLeaf( cuda_smaller_leaf_splits_->GetCUDAStruct(), cuda_larger_leaf_splits_->GetCUDAStruct(), num_data_in_smaller_leaf, num_data_in_larger_leaf, sum_hessians_in_smaller_leaf, - sum_hessians_in_larger_leaf); + sum_hessians_in_larger_leaf, + num_bits_in_histogram_bins); global_timer.Stop("CUDASingleGPUTreeLearner::ConstructHistogramForLeaf"); global_timer.Start("CUDASingleGPUTreeLearner::FindBestSplitsForLeaf"); - SelectFeatureByNode(tree.get()); - - cuda_best_split_finder_->FindBestSplitsForLeaf( + uint8_t parent_num_bits_bin = 0; + uint8_t smaller_num_bits_bin = 0; + uint8_t larger_num_bits_bin = 0; + if (config_->use_quantized_grad) { + if (larger_leaf_index_ != -1) { + const int parent_leaf_index = std::min(smaller_leaf_index_, larger_leaf_index_); + parent_num_bits_bin = cuda_gradient_discretizer_->GetHistBitsInNode(parent_leaf_index); + smaller_num_bits_bin = cuda_gradient_discretizer_->GetHistBitsInLeaf(smaller_leaf_index_); + larger_num_bits_bin = cuda_gradient_discretizer_->GetHistBitsInLeaf(larger_leaf_index_); + } else { + parent_num_bits_bin = cuda_gradient_discretizer_->GetHistBitsInLeaf(0); + smaller_num_bits_bin = cuda_gradient_discretizer_->GetHistBitsInLeaf(0); + larger_num_bits_bin = cuda_gradient_discretizer_->GetHistBitsInLeaf(0); + } + } else { + parent_num_bits_bin = 0; + smaller_num_bits_bin = 0; + larger_num_bits_bin = 0; + } + cuda_histogram_constructor_->SubtractHistogramForLeaf( cuda_smaller_leaf_splits_->GetCUDAStruct(), cuda_larger_leaf_splits_->GetCUDAStruct(), - smaller_leaf_index_, larger_leaf_index_, - num_data_in_smaller_leaf, num_data_in_larger_leaf, - sum_hessians_in_smaller_leaf, sum_hessians_in_larger_leaf); + config_->use_quantized_grad, + parent_num_bits_bin, + smaller_num_bits_bin, + larger_num_bits_bin); + + SelectFeatureByNode(tree.get()); + + if (config_->use_quantized_grad) { + const uint8_t smaller_leaf_num_bits_bin = cuda_gradient_discretizer_->GetHistBitsInLeaf(smaller_leaf_index_); + const uint8_t larger_leaf_num_bits_bin = larger_leaf_index_ < 0 ? 32 : cuda_gradient_discretizer_->GetHistBitsInLeaf(larger_leaf_index_); + cuda_best_split_finder_->FindBestSplitsForLeaf( + cuda_smaller_leaf_splits_->GetCUDAStruct(), + cuda_larger_leaf_splits_->GetCUDAStruct(), + smaller_leaf_index_, larger_leaf_index_, + num_data_in_smaller_leaf, num_data_in_larger_leaf, + sum_hessians_in_smaller_leaf, sum_hessians_in_larger_leaf, + cuda_gradient_discretizer_->grad_scale_ptr(), + cuda_gradient_discretizer_->hess_scale_ptr(), + smaller_leaf_num_bits_bin, + larger_leaf_num_bits_bin); + } else { + cuda_best_split_finder_->FindBestSplitsForLeaf( + cuda_smaller_leaf_splits_->GetCUDAStruct(), + cuda_larger_leaf_splits_->GetCUDAStruct(), + smaller_leaf_index_, larger_leaf_index_, + num_data_in_smaller_leaf, num_data_in_larger_leaf, + sum_hessians_in_smaller_leaf, sum_hessians_in_larger_leaf, + nullptr, nullptr, 0, 0); + } + global_timer.Stop("CUDASingleGPUTreeLearner::FindBestSplitsForLeaf"); global_timer.Start("CUDASingleGPUTreeLearner::FindBestFromAllSplits"); const CUDASplitInfo* best_split_info = nullptr; @@ -247,9 +320,19 @@ Tree* CUDASingleGPUTreeLearner::Train(const score_t* gradients, #endif // DEBUG smaller_leaf_index_ = (leaf_num_data_[best_leaf_index_] < leaf_num_data_[right_leaf_index] ? best_leaf_index_ : right_leaf_index); larger_leaf_index_ = (smaller_leaf_index_ == best_leaf_index_ ? 
right_leaf_index : best_leaf_index_); + + if (config_->use_quantized_grad) { + cuda_gradient_discretizer_->SetNumBitsInHistogramBin( + best_leaf_index_, right_leaf_index, leaf_num_data_[best_leaf_index_], leaf_num_data_[right_leaf_index]); + } global_timer.Stop("CUDASingleGPUTreeLearner::Split"); } SynchronizeCUDADevice(__FILE__, __LINE__); + if (config_->use_quantized_grad && config_->quant_train_renew_leaf) { + global_timer.Start("CUDASingleGPUTreeLearner::RenewDiscretizedTreeLeaves"); + RenewDiscretizedTreeLeaves(tree.get()); + global_timer.Stop("CUDASingleGPUTreeLearner::RenewDiscretizedTreeLeaves"); + } tree->ToHost(); return tree.release(); } @@ -322,7 +405,7 @@ void CUDASingleGPUTreeLearner::RenewTreeOutput(Tree* tree, const ObjectiveFuncti } std::vector n_nozeroworker_perleaf(cuda_tree->num_leaves(), 1); int num_machines = Network::num_machines(); - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (int i = 0; i < cuda_tree->num_leaves(); ++i) { const double output = static_cast(cuda_tree->LeafOutput(i)); data_size_t cnt_leaf_data = leaf_num_data_[i]; @@ -357,8 +440,8 @@ void CUDASingleGPUTreeLearner::RenewTreeOutput(Tree* tree, const ObjectiveFuncti Tree* CUDASingleGPUTreeLearner::FitByExistingTree(const Tree* old_tree, const score_t* gradients, const score_t* hessians) const { std::unique_ptr cuda_tree(new CUDATree(old_tree)); - SetCUDAMemory(cuda_leaf_gradient_stat_buffer_, 0, static_cast(old_tree->num_leaves()), __FILE__, __LINE__); - SetCUDAMemory(cuda_leaf_hessian_stat_buffer_, 0, static_cast(old_tree->num_leaves()), __FILE__, __LINE__); + cuda_leaf_gradient_stat_buffer_.SetValue(0); + cuda_leaf_hessian_stat_buffer_.SetValue(0); ReduceLeafStat(cuda_tree.get(), gradients, hessians, cuda_data_partition_->cuda_data_indices()); cuda_tree->SyncLeafOutputFromCUDAToHost(); return cuda_tree.release(); @@ -373,13 +456,9 @@ Tree* CUDASingleGPUTreeLearner::FitByExistingTree(const Tree* old_tree, const st const int num_block = (refit_num_data_ + CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE - 1) / CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE; buffer_size *= static_cast(num_block + 1); } - if (buffer_size != leaf_stat_buffer_size_) { - if (leaf_stat_buffer_size_ != 0) { - DeallocateCUDAMemory(&cuda_leaf_gradient_stat_buffer_, __FILE__, __LINE__); - DeallocateCUDAMemory(&cuda_leaf_hessian_stat_buffer_, __FILE__, __LINE__); - } - AllocateCUDAMemory(&cuda_leaf_gradient_stat_buffer_, static_cast(buffer_size), __FILE__, __LINE__); - AllocateCUDAMemory(&cuda_leaf_hessian_stat_buffer_, static_cast(buffer_size), __FILE__, __LINE__); + if (static_cast(buffer_size) > cuda_leaf_gradient_stat_buffer_.Size()) { + cuda_leaf_gradient_stat_buffer_.Resize(buffer_size); + cuda_leaf_hessian_stat_buffer_.Resize(buffer_size); } return FitByExistingTree(old_tree, gradients, hessians); } @@ -513,6 +592,15 @@ void CUDASingleGPUTreeLearner::CheckSplitValid( } #endif // DEBUG +void CUDASingleGPUTreeLearner::RenewDiscretizedTreeLeaves(CUDATree* cuda_tree) { + cuda_data_partition_->ReduceLeafGradStat( + gradients_, hessians_, cuda_tree, + cuda_leaf_gradient_stat_buffer_.RawData(), + cuda_leaf_hessian_stat_buffer_.RawData()); + LaunchCalcLeafValuesGivenGradStat(cuda_tree, cuda_data_partition_->cuda_data_indices()); + SynchronizeCUDADevice(__FILE__, __LINE__); +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu index 8a558ddc43d1..670f1f36d643 
100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.cu @@ -129,18 +129,18 @@ void CUDASingleGPUTreeLearner::LaunchReduceLeafStatKernel( if (num_leaves <= 2048) { ReduceLeafStatKernel_SharedMemory<<>>( gradients, hessians, num_leaves, num_data, cuda_data_partition_->cuda_data_index_to_leaf_index(), - cuda_leaf_gradient_stat_buffer_, cuda_leaf_hessian_stat_buffer_); + cuda_leaf_gradient_stat_buffer_.RawData(), cuda_leaf_hessian_stat_buffer_.RawData()); } else { ReduceLeafStatKernel_GlobalMemory<<>>( gradients, hessians, num_leaves, num_data, cuda_data_partition_->cuda_data_index_to_leaf_index(), - cuda_leaf_gradient_stat_buffer_, cuda_leaf_hessian_stat_buffer_); + cuda_leaf_gradient_stat_buffer_.RawData(), cuda_leaf_hessian_stat_buffer_.RawData()); } const bool use_l1 = config_->lambda_l1 > 0.0f; const bool use_smoothing = config_->path_smooth > 0.0f; num_block = (num_leaves + CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE - 1) / CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE; #define CalcRefitLeafOutputKernel_ARGS \ - num_leaves, cuda_leaf_gradient_stat_buffer_, cuda_leaf_hessian_stat_buffer_, num_data_in_leaf, \ + num_leaves, cuda_leaf_gradient_stat_buffer_.RawData(), cuda_leaf_hessian_stat_buffer_.RawData(), num_data_in_leaf, \ leaf_parent, left_child, right_child, \ config_->lambda_l1, config_->lambda_l2, config_->path_smooth, \ shrinkage_rate, config_->refit_decay_rate, cuda_leaf_value @@ -162,6 +162,7 @@ void CUDASingleGPUTreeLearner::LaunchReduceLeafStatKernel( <<>>(CalcRefitLeafOutputKernel_ARGS); } } + #undef CalcRefitLeafOutputKernel_ARGS } template @@ -256,6 +257,37 @@ void CUDASingleGPUTreeLearner::LaunchConstructBitsetForCategoricalSplitKernel( CUDAConstructBitset(best_split_info, num_cat_threshold_, cuda_bitset_, cuda_bitset_len_); } +void CUDASingleGPUTreeLearner::LaunchCalcLeafValuesGivenGradStat( + CUDATree* cuda_tree, const data_size_t* num_data_in_leaf) { + #define CalcRefitLeafOutputKernel_ARGS \ + cuda_tree->num_leaves(), cuda_leaf_gradient_stat_buffer_.RawData(), cuda_leaf_hessian_stat_buffer_.RawData(), num_data_in_leaf, \ + cuda_tree->cuda_leaf_parent(), cuda_tree->cuda_left_child(), cuda_tree->cuda_right_child(), \ + config_->lambda_l1, config_->lambda_l2, config_->path_smooth, \ + 1.0f, config_->refit_decay_rate, cuda_tree->cuda_leaf_value_ref() + const bool use_l1 = config_->lambda_l1 > 0.0f; + const bool use_smoothing = config_->path_smooth > 0.0f; + const int num_block = (cuda_tree->num_leaves() + CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE - 1) / CUDA_SINGLE_GPU_TREE_LEARNER_BLOCK_SIZE; + if (!use_l1) { + if (!use_smoothing) { + CalcRefitLeafOutputKernel + <<>>(CalcRefitLeafOutputKernel_ARGS); + } else { + CalcRefitLeafOutputKernel + <<>>(CalcRefitLeafOutputKernel_ARGS); + } + } else { + if (!use_smoothing) { + CalcRefitLeafOutputKernel + <<>>(CalcRefitLeafOutputKernel_ARGS); + } else { + CalcRefitLeafOutputKernel + <<>>(CalcRefitLeafOutputKernel_ARGS); + } + } + + #undef CalcRefitLeafOutputKernel_ARGS +} + } // namespace LightGBM #endif // USE_CUDA diff --git a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp index 576d01ffe5cc..a1ea79efa1a1 100644 --- a/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp +++ b/src/treelearner/cuda/cuda_single_gpu_tree_learner.hpp @@ -16,6 +16,7 @@ #include "cuda_data_partition.hpp" #include "cuda_best_split_finder.hpp" +#include "cuda_gradient_discretizer.hpp" #include "../serial_tree_learner.h" namespace LightGBM 
{ @@ -74,6 +75,10 @@ class CUDASingleGPUTreeLearner: public SerialTreeLearner { const double sum_left_gradients, const double sum_right_gradients); #endif // DEBUG + void RenewDiscretizedTreeLeaves(CUDATree* cuda_tree); + + void LaunchCalcLeafValuesGivenGradStat(CUDATree* cuda_tree, const data_size_t* num_data_in_leaf); + + // GPU device ID int gpu_device_id_; // number of threads on CPU @@ -90,6 +95,8 @@ class CUDASingleGPUTreeLearner: public SerialTreeLearner { std::unique_ptr cuda_histogram_constructor_; // for best split information finding, given the histograms std::unique_ptr cuda_best_split_finder_; + // gradient discretizer for quantized training + std::unique_ptr<CUDAGradientDiscretizer> cuda_gradient_discretizer_; std::vector leaf_best_split_feature_; std::vector leaf_best_split_threshold_; @@ -108,8 +115,8 @@ class CUDASingleGPUTreeLearner: public SerialTreeLearner { std::vector categorical_bin_to_value_; std::vector categorical_bin_offsets_; - mutable double* cuda_leaf_gradient_stat_buffer_; - mutable double* cuda_leaf_hessian_stat_buffer_; + mutable CUDAVector<double> cuda_leaf_gradient_stat_buffer_; + mutable CUDAVector<double> cuda_leaf_hessian_stat_buffer_; mutable data_size_t leaf_stat_buffer_size_; mutable data_size_t refit_num_data_; uint32_t* cuda_bitset_; @@ -148,7 +155,7 @@ class CUDASingleGPUTreeLearner: public SerialTreeLearner { #pragma warning(disable : 4702) explicit CUDASingleGPUTreeLearner(const Config* tree_config, const bool /*boosting_on_cuda*/) : SerialTreeLearner(tree_config) { Log::Fatal("CUDA Tree Learner was not enabled in this build.\n" - "Please recompile with CMake option -DUSE_CUDAP=1"); + "Please recompile with CMake option -DUSE_CUDA=1"); } }; diff --git a/src/treelearner/data_parallel_tree_learner.cpp b/src/treelearner/data_parallel_tree_learner.cpp index 2509db5e722a..64c342e5b01d 100644 --- a/src/treelearner/data_parallel_tree_learner.cpp +++ b/src/treelearner/data_parallel_tree_learner.cpp @@ -228,7 +228,7 @@ void DataParallelTreeLearner::FindBestSplits(const Tree* tree) { if (local_data_on_smaller_leaf <= 0) { // clear histogram buffer before synchronizing // otherwise histogram contents from the previous iteration will be sent - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (int feature_index = 0; feature_index < this->num_features_; ++feature_index) { if (this->col_sampler_.is_feature_used_bytree()[feature_index] == false) continue; @@ -249,7 +249,7 @@ void DataParallelTreeLearner::FindBestSplits(const Tree* tree) { // construct local histograms global_timer.Start("DataParallelTreeLearner::ReduceHistogram"); global_timer.Start("DataParallelTreeLearner::ReduceHistogram::Copy"); - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (int feature_index = 0; feature_index < this->num_features_; ++feature_index) { if (this->col_sampler_.is_feature_used_bytree()[feature_index] == false) continue; @@ -318,7 +318,7 @@ void DataParallelTreeLearner::FindBestSplitsFromHistograms(const if (parent_num_bits > 16 && larger_leaf_num_bits <= 16) { CHECK_LE(smaller_leaf_num_bits, 16); OMP_INIT_EX(); - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (int feature_index = 0; feature_index < this->num_features_; ++feature_index) { OMP_LOOP_EX_BEGIN(); if (!is_feature_aggregated_[feature_index]) continue; @@ -330,7 +330,7 @@ void DataParallelTreeLearner::FindBestSplitsFromHistograms(const }
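The change running through these treelearner files is mechanical: every "#pragma omp parallel" gains an explicit num_threads(OMP_NUM_THREADS()) clause, so each parallel region resolves its thread count through LightGBM's wrapper rather than through whatever the OpenMP global default happens to be at that moment. A minimal sketch of the resulting pattern, assuming only the openmp_wrapper.h header that declares OMP_NUM_THREADS() (its implementation appears later in this diff):

#include <LightGBM/utils/openmp_wrapper.h>  // assumed location of OMP_NUM_THREADS()

#include <vector>

// Parallel reduction pinned to the capped thread count, mirroring the pragmas above.
double CappedSum(const std::vector<double>& values) {
  const int n = static_cast<int>(values.size());
  double total = 0.0;
  #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:total)
  for (int i = 0; i < n; ++i) {
    total += values[i];
  }
  return total;
}

Pinning the count at every region is what makes a process-wide cap enforceable: a region without the clause silently falls back to the OpenMP runtime's own setting.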
OMP_INIT_EX(); - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (int feature_index = 0; feature_index < this->num_features_; ++feature_index) { OMP_LOOP_EX_BEGIN(); if (!is_feature_aggregated_[feature_index]) continue; diff --git a/src/treelearner/data_partition.hpp b/src/treelearner/data_partition.hpp index 7a6ac031e62d..25c39c3ebde2 100644 --- a/src/treelearner/data_partition.hpp +++ b/src/treelearner/data_partition.hpp @@ -52,7 +52,7 @@ class DataPartition { if (used_data_indices_ == nullptr) { // if using all data leaf_count_[0] = num_data_; -#pragma omp parallel for schedule(static, 512) if (num_data_ >= 1024) +#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_data_ >= 1024) for (data_size_t i = 0; i < num_data_; ++i) { indices_[i] = i; } diff --git a/src/treelearner/feature_histogram.hpp b/src/treelearner/feature_histogram.hpp index d917ed7917ec..bd5bda1e8879 100644 --- a/src/treelearner/feature_histogram.hpp +++ b/src/treelearner/feature_histogram.hpp @@ -1692,7 +1692,7 @@ class HistogramPool { auto& ref_feature_meta = *feature_meta; const int num_feature = train_data->num_features(); ref_feature_meta.resize(num_feature); -#pragma omp parallel for schedule(static, 512) if (num_feature >= 1024) +#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) if (num_feature >= 1024) for (int i = 0; i < num_feature; ++i) { if (USE_DATA) { ref_feature_meta[i].num_bin = train_data->FeatureNumBin(i); @@ -1749,7 +1749,7 @@ class HistogramPool { if (config->use_quantized_grad) { OMP_INIT_EX(); - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (int i = old_cache_size; i < cache_size; ++i) { OMP_LOOP_EX_BEGIN(); pool_[i].reset(new FeatureHistogram[train_data->num_features()]); @@ -1763,7 +1763,7 @@ class HistogramPool { OMP_THROW_EX(); } else { OMP_INIT_EX(); - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (int i = old_cache_size; i < cache_size; ++i) { OMP_LOOP_EX_BEGIN(); pool_[i].reset(new FeatureHistogram[train_data->num_features()]); @@ -1787,7 +1787,7 @@ class HistogramPool { old_config->extra_trees != config->extra_trees || old_config->max_delta_step != config->max_delta_step || old_config->path_smooth != config->path_smooth) { -#pragma omp parallel for schedule(static) +#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (int i = 0; i < cache_size_; ++i) { for (int j = 0; j < train_data->num_features(); ++j) { pool_[i][j].ResetFunc(); diff --git a/src/treelearner/gpu_tree_learner.cpp b/src/treelearner/gpu_tree_learner.cpp index 294be28b6f86..0817614483b2 100644 --- a/src/treelearner/gpu_tree_learner.cpp +++ b/src/treelearner/gpu_tree_learner.cpp @@ -191,7 +191,7 @@ void GPUTreeLearner::WaitAndGetHistograms(hist_t* histograms) { HistType* hist_outputs = reinterpret_cast(host_histogram_outputs_); // when the output is ready, the computation is done histograms_wait_obj_.wait(); - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (int i = 0; i < num_dense_feature_groups_; ++i) { if (!feature_masks_[i]) { continue; @@ -344,7 +344,7 @@ void GPUTreeLearner::AllocateGPUMemory() { // for data transfer time auto start_time = std::chrono::steady_clock::now(); // Now generate new data structure feature4, and copy data to the device - 
int nthreads = std::min(omp_get_max_threads(), static_cast(dense_feature_group_map_.size()) / dword_features_); + int nthreads = std::min(OMP_NUM_THREADS(), static_cast(dense_feature_group_map_.size()) / dword_features_); nthreads = std::max(nthreads, 1); std::vector host4_vecs(nthreads); std::vector host4_bufs(nthreads); @@ -359,7 +359,7 @@ void GPUTreeLearner::AllocateGPUMemory() { 0, num_data_ * sizeof(Feature4))); } // building Feature4 bundles; each thread handles dword_features_ features - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (int i = 0; i < static_cast(dense_feature_group_map_.size() / dword_features_); ++i) { int tid = omp_get_thread_num(); Feature4* host4 = host4_ptrs[tid]; @@ -451,7 +451,7 @@ void GPUTreeLearner::AllocateGPUMemory() { BinIterator* bin_iter = train_data_->FeatureGroupIterator(dense_dword_ind[i]); if (dynamic_cast*>(bin_iter) != 0) { DenseBinIterator iter = *static_cast*>(bin_iter); - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (int j = 0; j < num_data_; ++j) { host4[j].s[i >> 1] |= (uint8_t)((iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i] + ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1))) @@ -464,14 +464,14 @@ void GPUTreeLearner::AllocateGPUMemory() { BinIterator* bin_iter = train_data_->FeatureGroupIterator(dense_dword_ind[i]); if (dynamic_cast*>(bin_iter) != 0) { DenseBinIterator iter = *static_cast*>(bin_iter); - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (int j = 0; j < num_data_; ++j) { host4[j].s[i] = (uint8_t)(iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i] + ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1))); } } else if (dynamic_cast*>(bin_iter) != 0) { DenseBinIterator iter = *static_cast*>(bin_iter); - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (int j = 0; j < num_data_; ++j) { host4[j].s[i] = (uint8_t)(iter.RawGet(j) * device_bin_mults_[copied_feature4 * dword_features_ + i] + ((j+i) & (device_bin_mults_[copied_feature4 * dword_features_ + i] - 1))); @@ -485,7 +485,7 @@ void GPUTreeLearner::AllocateGPUMemory() { } // fill the leftover features if (dword_features_ == 8) { - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (int j = 0; j < num_data_; ++j) { for (int i = k; i < dword_features_; ++i) { // fill this empty feature with some "random" value @@ -493,7 +493,7 @@ void GPUTreeLearner::AllocateGPUMemory() { } } } else if (dword_features_ == 4) { - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (int j = 0; j < num_data_; ++j) { for (int i = k; i < dword_features_; ++i) { // fill this empty feature with some "random" value @@ -572,7 +572,7 @@ void GPUTreeLearner::BuildGPUKernels() { // currently we don't use constant memory int use_constants = 0; OMP_INIT_EX(); - #pragma omp parallel for schedule(guided) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(guided) for (int i = 0; i <= kMaxLogWorkgroupsPerFeature; ++i) { OMP_LOOP_EX_BEGIN(); boost::compute::program program; @@ -811,7 +811,7 @@ void GPUTreeLearner::BeforeTrain() { // transfer the indices to GPU indices_future_ = 
boost::compute::copy_async(indices, indices + cnt, device_data_indices_->begin(), queue_); if (!share_state_->is_constant_hessian) { - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (data_size_t i = 0; i < cnt; ++i) { ordered_hessians_[i] = hessians_[indices[i]]; } @@ -827,7 +827,7 @@ void GPUTreeLearner::BeforeTrain() { histogram_fulldata_kernels_[i].set_arg(6, const_hessian); } } - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (data_size_t i = 0; i < cnt; ++i) { ordered_gradients_[i] = gradients_[indices[i]]; } @@ -865,7 +865,7 @@ bool GPUTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int ri indices_future_ = boost::compute::copy_async(indices + begin, indices + end, device_data_indices_->begin(), queue_); if (!share_state_->is_constant_hessian) { - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (data_size_t i = begin; i < end; ++i) { ordered_hessians_[i - begin] = hessians_[indices[i]]; } @@ -873,7 +873,7 @@ bool GPUTreeLearner::BeforeFindBestSplit(const Tree* tree, int left_leaf, int ri hessians_future_ = queue_.enqueue_write_buffer_async(device_hessians_, 0, (end - begin) * sizeof(score_t), ptr_pinned_hessians_); } - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (data_size_t i = begin; i < end; ++i) { ordered_gradients_[i - begin] = gradients_[indices[i]]; } @@ -907,7 +907,7 @@ bool GPUTreeLearner::ConstructGPUHistogramsAsync( // generate and copy ordered_gradients if gradients is not null if (gradients != nullptr) { if (num_data != num_data_) { - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (data_size_t i = 0; i < num_data; ++i) { ordered_gradients[i] = gradients[data_indices[i]]; } @@ -919,7 +919,7 @@ bool GPUTreeLearner::ConstructGPUHistogramsAsync( // generate and copy ordered_hessians if Hessians is not null if (hessians != nullptr && !share_state_->is_constant_hessian) { if (num_data != num_data_) { - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (data_size_t i = 0; i < num_data; ++i) { ordered_hessians[i] = hessians[data_indices[i]]; } @@ -930,7 +930,7 @@ bool GPUTreeLearner::ConstructGPUHistogramsAsync( } // converted indices in is_feature_used to feature-group indices std::vector is_feature_group_used(num_feature_groups_, 0); - #pragma omp parallel for schedule(static, 1024) if (num_features_ >= 2048) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 1024) if (num_features_ >= 2048) for (int i = 0; i < num_features_; ++i) { if (is_feature_used[i]) { is_feature_group_used[train_data_->Feature2Group(i)] = 1; @@ -938,7 +938,7 @@ bool GPUTreeLearner::ConstructGPUHistogramsAsync( } // construct the feature masks for dense feature-groups int used_dense_feature_groups = 0; - #pragma omp parallel for schedule(static, 1024) reduction(+:used_dense_feature_groups) if (num_dense_feature_groups_ >= 2048) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 1024) reduction(+:used_dense_feature_groups) if (num_dense_feature_groups_ >= 2048) for (int i = 0; i < num_dense_feature_groups_; ++i) { if (is_feature_group_used[dense_feature_group_map_[i]]) { feature_masks_[i] = 1; @@ -973,7 
+973,7 @@ bool GPUTreeLearner::ConstructGPUHistogramsAsync( void GPUTreeLearner::ConstructHistograms(const std::vector& is_feature_used, bool use_subtract) { std::vector is_sparse_feature_used(num_features_, 0); std::vector is_dense_feature_used(num_features_, 0); - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (int feature_index = 0; feature_index < num_features_; ++feature_index) { if (!col_sampler_.is_feature_used_bytree()[feature_index]) continue; if (!is_feature_used[feature_index]) continue; diff --git a/src/treelearner/gradient_discretizer.cpp b/src/treelearner/gradient_discretizer.cpp index 4c00f73ab12c..b7e2e814b74a 100644 --- a/src/treelearner/gradient_discretizer.cpp +++ b/src/treelearner/gradient_discretizer.cpp @@ -216,7 +216,7 @@ void GradientDiscretizer::RenewIntGradTreeOutput( data_size_t leaf_cnt = 0; const data_size_t* data_indices = data_partition->GetIndexOnLeaf(leaf_id, &leaf_cnt); double sum_gradient = 0.0f, sum_hessian = 0.0f; - #pragma omp parallel for schedule(static) reduction(+:sum_gradient, sum_hessian) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_gradient, sum_hessian) for (data_size_t i = 0; i < leaf_cnt; ++i) { const data_size_t index = data_indices[i]; const score_t grad = gradients[index]; @@ -242,7 +242,7 @@ void GradientDiscretizer::RenewIntGradTreeOutput( data_size_t leaf_cnt = 0; const data_size_t* data_indices = data_partition->GetIndexOnLeaf(leaf_id, &leaf_cnt); double sum_gradient = 0.0f, sum_hessian = 0.0f; - #pragma omp parallel for schedule(static) reduction(+:sum_gradient, sum_hessian) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) reduction(+:sum_gradient, sum_hessian) for (data_size_t i = 0; i < leaf_cnt; ++i) { const data_size_t index = data_indices[i]; const score_t grad = gradients[index]; diff --git a/src/treelearner/leaf_splits.hpp b/src/treelearner/leaf_splits.hpp index fdf55693a0e9..2db71573e97e 100644 --- a/src/treelearner/leaf_splits.hpp +++ b/src/treelearner/leaf_splits.hpp @@ -95,7 +95,7 @@ class LeafSplits { data_indices_ = nullptr; double tmp_sum_gradients = 0.0f; double tmp_sum_hessians = 0.0f; -#pragma omp parallel for schedule(static, 512) reduction(+:tmp_sum_gradients, tmp_sum_hessians) if (num_data_in_leaf_ >= 1024 && !deterministic_) +#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) reduction(+:tmp_sum_gradients, tmp_sum_hessians) if (num_data_in_leaf_ >= 1024 && !deterministic_) for (data_size_t i = 0; i < num_data_in_leaf_; ++i) { tmp_sum_gradients += gradients[i]; tmp_sum_hessians += hessians[i]; @@ -120,7 +120,7 @@ class LeafSplits { double tmp_sum_hessians = 0.0f; const int16_t* packed_int_gradients_and_hessians = reinterpret_cast(int_gradients_and_hessians); int64_t tmp_sum_gradients_and_hessians = 0; -#pragma omp parallel for schedule(static, 512) reduction(+:tmp_sum_gradients, tmp_sum_hessians, tmp_sum_gradients_and_hessians) if (num_data_in_leaf_ >= 1024 && !deterministic_) +#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) reduction(+:tmp_sum_gradients, tmp_sum_hessians, tmp_sum_gradients_and_hessians) if (num_data_in_leaf_ >= 1024 && !deterministic_) for (data_size_t i = 0; i < num_data_in_leaf_; ++i) { tmp_sum_gradients += int_gradients_and_hessians[2 * i + 1] * grad_scale; tmp_sum_hessians += int_gradients_and_hessians[2 * i] * hess_scale; @@ -149,7 +149,7 @@ class LeafSplits { data_indices_ = 
data_partition->GetIndexOnLeaf(leaf, &num_data_in_leaf_); double tmp_sum_gradients = 0.0f; double tmp_sum_hessians = 0.0f; -#pragma omp parallel for schedule(static, 512) reduction(+:tmp_sum_gradients, tmp_sum_hessians) if (num_data_in_leaf_ >= 1024 && !deterministic_) +#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) reduction(+:tmp_sum_gradients, tmp_sum_hessians) if (num_data_in_leaf_ >= 1024 && !deterministic_) for (data_size_t i = 0; i < num_data_in_leaf_; ++i) { const data_size_t idx = data_indices_[i]; tmp_sum_gradients += gradients[idx]; @@ -177,7 +177,7 @@ class LeafSplits { double tmp_sum_hessians = 0.0f; const int16_t* packed_int_gradients_and_hessians = reinterpret_cast(int_gradients_and_hessians); int64_t tmp_sum_gradients_and_hessians = 0; -#pragma omp parallel for schedule(static, 512) reduction(+:tmp_sum_gradients, tmp_sum_hessians, tmp_sum_gradients_and_hessians) if (num_data_in_leaf_ >= 1024 && deterministic_) +#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 512) reduction(+:tmp_sum_gradients, tmp_sum_hessians, tmp_sum_gradients_and_hessians) if (num_data_in_leaf_ >= 1024 && deterministic_) for (data_size_t i = 0; i < num_data_in_leaf_; ++i) { const data_size_t idx = data_indices_[i]; tmp_sum_gradients += int_gradients_and_hessians[2 * idx + 1] * grad_scale; diff --git a/src/treelearner/linear_tree_learner.cpp b/src/treelearner/linear_tree_learner.cpp index 071a6405709c..c96bce64d644 100644 --- a/src/treelearner/linear_tree_learner.cpp +++ b/src/treelearner/linear_tree_learner.cpp @@ -19,7 +19,7 @@ void LinearTreeLearner::InitLinear(const Dataset* train_data, const int max_leav leaf_map_ = std::vector(train_data->num_data(), -1); contains_nan_ = std::vector(train_data->num_features(), 0); // identify features containing nans -#pragma omp parallel for schedule(static) +#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (int feat = 0; feat < train_data->num_features(); ++feat) { auto bin_mapper = train_data_->FeatureBinMapper(feat); if (bin_mapper->bin_type() == BinType::NumericalBin) { @@ -52,7 +52,7 @@ void LinearTreeLearner::InitLinear(const Dataset* train_data, const int max_leav } XTHX_by_thread_.clear(); XTg_by_thread_.clear(); - int max_threads = omp_get_max_threads(); + int max_threads = OMP_NUM_THREADS(); for (int i = 0; i < max_threads; ++i) { XTHX_by_thread_.push_back(XTHX_); XTg_by_thread_.push_back(XTg_); @@ -159,7 +159,7 @@ void LinearTreeLearner::GetLeafMap(Tree* tree) const { std::fill(leaf_map_.begin(), leaf_map_.end(), -1); // map data to leaf number const data_size_t* ind = data_partition_->indices(); -#pragma omp parallel for schedule(dynamic) +#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(dynamic) for (int i = 0; i < tree->num_leaves(); ++i) { data_size_t idx = data_partition_->leaf_begin(i); for (int j = 0; j < data_partition_->leaf_count(i); ++j) { @@ -224,7 +224,7 @@ void LinearTreeLearner::CalculateLinear(Tree* tree, bool is_refit, const score_t } } // clear the coefficient matrices -#pragma omp parallel for schedule(static) +#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (int i = 0; i < num_threads; ++i) { for (int leaf_num = 0; leaf_num < num_leaves; ++leaf_num) { size_t num_feat = leaf_features[leaf_num].size(); @@ -232,7 +232,7 @@ void LinearTreeLearner::CalculateLinear(Tree* tree, bool is_refit, const score_t std::fill(XTg_by_thread_[i][leaf_num].begin(), XTg_by_thread_[i][leaf_num].begin() + num_feat + 1, 0.0f); } } 
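Many of the hunks above pair the capped pragma with LightGBM's exception guards, because a C++ exception must not propagate out of an OpenMP region. A compact sketch of that idiom under the same openmp_wrapper.h assumption; OMP_LOOP_EX_END() is assumed to close each guarded iteration, since the truncated hunks here only show OMP_INIT_EX(), OMP_LOOP_EX_BEGIN(), and OMP_THROW_EX():

#include <LightGBM/utils/openmp_wrapper.h>

// Hypothetical per-feature worker showing the guarded-loop idiom: exceptions
// thrown inside the parallel region are captured per iteration, and the first
// one is rethrown on the calling thread once the region has ended.
void ProcessFeatures(int num_features) {
  OMP_INIT_EX();
  #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static)
  for (int i = 0; i < num_features; ++i) {
    OMP_LOOP_EX_BEGIN();
    // ... per-feature work that may throw ...
    OMP_LOOP_EX_END();
  }
  OMP_THROW_EX();
}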
-#pragma omp parallel for schedule(static) +#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (int leaf_num = 0; leaf_num < num_leaves; ++leaf_num) { size_t num_feat = leaf_features[leaf_num].size(); std::fill(XTHX_[leaf_num].begin(), XTHX_[leaf_num].begin() + (num_feat + 1) * (num_feat + 2) / 2, 0.0f); @@ -245,7 +245,7 @@ void LinearTreeLearner::CalculateLinear(Tree* tree, bool is_refit, const score_t } } OMP_INIT_EX(); -#pragma omp parallel if (num_data_ > 1024) +#pragma omp parallel num_threads(OMP_NUM_THREADS()) if (num_data_ > 1024) { std::vector curr_row(max_num_features + 1); int tid = omp_get_thread_num(); @@ -296,7 +296,7 @@ void LinearTreeLearner::CalculateLinear(Tree* tree, bool is_refit, const score_t auto total_nonzero = std::vector(tree->num_leaves()); // aggregate results from different threads for (int tid = 0; tid < num_threads; ++tid) { -#pragma omp parallel for schedule(static) +#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (int leaf_num = 0; leaf_num < num_leaves; ++leaf_num) { size_t num_feat = leaf_features[leaf_num].size(); for (size_t j = 0; j < (num_feat + 1) * (num_feat + 2) / 2; ++j) { @@ -318,7 +318,7 @@ void LinearTreeLearner::CalculateLinear(Tree* tree, bool is_refit, const score_t double shrinkage = tree->shrinkage(); double decay_rate = config_->refit_decay_rate; // copy into eigen matrices and solve -#pragma omp parallel for schedule(static) +#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (int leaf_num = 0; leaf_num < num_leaves; ++leaf_num) { if (total_nonzero[leaf_num] < static_cast(leaf_features[leaf_num].size()) + 1) { if (is_refit) { diff --git a/src/treelearner/linear_tree_learner.h b/src/treelearner/linear_tree_learner.h index 7e7b1f6d2a8d..770b18d133e5 100644 --- a/src/treelearner/linear_tree_learner.h +++ b/src/treelearner/linear_tree_learner.h @@ -75,7 +75,7 @@ class LinearTreeLearner: public SerialTreeLearner { leaf_num_features[leaf_num] = static_cast(feat_ptr[leaf_num].size()); } OMP_INIT_EX(); -#pragma omp parallel for schedule(static) if (num_data_ > 1024) +#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) if (num_data_ > 1024) for (int i = 0; i < num_data_; ++i) { OMP_LOOP_EX_BEGIN(); int leaf_num = leaf_map_[i]; diff --git a/src/treelearner/serial_tree_learner.cpp b/src/treelearner/serial_tree_learner.cpp index 37d9a2a50713..d5c5cc59ef3a 100644 --- a/src/treelearner/serial_tree_learner.cpp +++ b/src/treelearner/serial_tree_learner.cpp @@ -242,7 +242,7 @@ Tree* SerialTreeLearner::FitByExistingTree(const Tree* old_tree, const score_t* auto tree = std::unique_ptr(new Tree(*old_tree)); CHECK_GE(data_partition_->num_leaves(), tree->num_leaves()); OMP_INIT_EX(); - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (int i = 0; i < tree->num_leaves(); ++i) { OMP_LOOP_EX_BEGIN(); data_size_t cnt_leaf_data = 0; @@ -379,7 +379,7 @@ void SerialTreeLearner::FindBestSplits(const Tree* tree) { void SerialTreeLearner::FindBestSplits(const Tree* tree, const std::set* force_features) { std::vector is_feature_used(num_features_, 0); - #pragma omp parallel for schedule(static, 256) if (num_features_ >= 512) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 256) if (num_features_ >= 512) for (int feature_index = 0; feature_index < num_features_; ++feature_index) { if (!col_sampler_.is_feature_used_bytree()[feature_index] && (force_features == nullptr || 
force_features->find(feature_index) == force_features->end())) continue; if (parent_leaf_histogram_array_ != nullptr @@ -922,7 +922,7 @@ void SerialTreeLearner::RenewTreeOutput(Tree* tree, const ObjectiveFunction* obj } std::vector n_nozeroworker_perleaf(tree->num_leaves(), 1); int num_machines = Network::num_machines(); - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (int i = 0; i < tree->num_leaves(); ++i) { const double output = static_cast(tree->LeafOutput(i)); data_size_t cnt_leaf_data = 0; diff --git a/src/treelearner/serial_tree_learner.h b/src/treelearner/serial_tree_learner.h index 93e0787a90cf..43ff6a4b1e13 100644 --- a/src/treelearner/serial_tree_learner.h +++ b/src/treelearner/serial_tree_learner.h @@ -103,7 +103,7 @@ class SerialTreeLearner: public TreeLearner { if (tree->num_leaves() <= 1) { return; } -#pragma omp parallel for schedule(static, 1) +#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static, 1) for (int i = 0; i < tree->num_leaves(); ++i) { double output = static_cast(tree->LeafOutput(i)); data_size_t cnt_leaf_data = 0; diff --git a/src/treelearner/voting_parallel_tree_learner.cpp b/src/treelearner/voting_parallel_tree_learner.cpp index 0eee032839cd..b88db5a7ba28 100644 --- a/src/treelearner/voting_parallel_tree_learner.cpp +++ b/src/treelearner/voting_parallel_tree_learner.cpp @@ -243,7 +243,7 @@ template void VotingParallelTreeLearner::FindBestSplits(const Tree* tree) { // use local data to find local best splits std::vector is_feature_used(this->num_features_, 0); -#pragma omp parallel for schedule(static) +#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (int feature_index = 0; feature_index < this->num_features_; ++feature_index) { if (!this->col_sampler_.is_feature_used_bytree()[feature_index]) continue; if (this->parent_leaf_histogram_array_ != nullptr @@ -265,7 +265,7 @@ void VotingParallelTreeLearner::FindBestSplits(const Tree* tree) // clear histogram buffer before synchronizing // otherwise histogram contents from the previous iteration will be sent OMP_INIT_EX(); - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (int feature_index = 0; feature_index < this->num_features_; ++feature_index) { OMP_LOOP_EX_BEGIN(); if (!is_feature_used[feature_index]) { continue; } @@ -285,7 +285,7 @@ void VotingParallelTreeLearner::FindBestSplits(const Tree* tree) const data_size_t local_data_on_larger_leaf = this->data_partition_->leaf_count(larger_leaf_index); if (local_data_on_larger_leaf <= 0) { OMP_INIT_EX(); - #pragma omp parallel for schedule(static) + #pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (int feature_index = 0; feature_index < this->num_features_; ++feature_index) { OMP_LOOP_EX_BEGIN(); if (!is_feature_used[feature_index]) { continue; } @@ -307,7 +307,7 @@ void VotingParallelTreeLearner::FindBestSplits(const Tree* tree) double larger_leaf_parent_output = this->GetParentOutput(tree, this->larger_leaf_splits_.get()); OMP_INIT_EX(); // find splits -#pragma omp parallel for schedule(static) +#pragma omp parallel for num_threads(OMP_NUM_THREADS()) schedule(static) for (int feature_index = 0; feature_index < this->num_features_; ++feature_index) { OMP_LOOP_EX_BEGIN(); if (!is_feature_used[feature_index]) { continue; } diff --git a/src/utils/openmp_wrapper.cpp b/src/utils/openmp_wrapper.cpp new file mode 100644 index 000000000000..fb6e661eb67c 
--- /dev/null +++ b/src/utils/openmp_wrapper.cpp @@ -0,0 +1,44 @@ +/*! + * Copyright (c) 2023 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + */ +#include <LightGBM/utils/openmp_wrapper.h> + +int LGBM_MAX_NUM_THREADS = -1; + +int LGBM_DEFAULT_NUM_THREADS = -1; + +#ifdef _OPENMP + +#include <omp.h> + +int OMP_NUM_THREADS() { + int default_num_threads = 1; + + if (LGBM_DEFAULT_NUM_THREADS > 0) { + // if LightGBM-specific default has been set, ignore OpenMP-global config + default_num_threads = LGBM_DEFAULT_NUM_THREADS; + } else { + // otherwise, default to OpenMP-global config + #pragma omp single + { default_num_threads = omp_get_max_threads(); } + } + + // ensure that if LGBM_SetMaxThreads() was ever called, LightGBM doesn't + // use more than that many threads + if (LGBM_MAX_NUM_THREADS > 0 && default_num_threads > LGBM_MAX_NUM_THREADS) { + return LGBM_MAX_NUM_THREADS; + } + + return default_num_threads; +} + +void OMP_SET_NUM_THREADS(int num_threads) { + if (num_threads <= 0) { + LGBM_DEFAULT_NUM_THREADS = -1; + } else { + LGBM_DEFAULT_NUM_THREADS = num_threads; + } +} + +#endif // _OPENMP diff --git a/tests/c_api_test/test_.py b/tests/c_api_test/test_.py index 4bb76e4aba19..6cfec1c445fc 100644 --- a/tests/c_api_test/test_.py +++ b/tests/c_api_test/test_.py @@ -247,3 +247,36 @@ def test_booster(): c_str(''), c_str('preb.txt')) LIB.LGBM_BoosterFree(booster2) + + +def test_max_thread_control(): + # at initialization, should be -1 + num_threads = ctypes.c_int(0) + ret = LIB.LGBM_GetMaxThreads( + ctypes.byref(num_threads) + ) + assert ret == 0 + assert num_threads.value == -1 + + # updating that value through the C API should work + ret = LIB.LGBM_SetMaxThreads( + ctypes.c_int(6) + ) + assert ret == 0 + + ret = LIB.LGBM_GetMaxThreads( + ctypes.byref(num_threads) + ) + assert ret == 0 + assert num_threads.value == 6 + + # resetting to any negative number should set it to -1 + ret = LIB.LGBM_SetMaxThreads( + ctypes.c_int(-123) + ) + assert ret == 0 + ret = LIB.LGBM_GetMaxThreads( + ctypes.byref(num_threads) + ) + assert ret == 0 + assert num_threads.value == -1
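The ctypes test above drives the two C API entry points behind this cap. For a C++ client the same round trip looks roughly like the following sketch; the declarations are assumed to live in LightGBM's c_api.h with int status returns (0 on success), matching what the test observes through ctypes:

#include <LightGBM/c_api.h>

#include <cstdio>

int main() {
  int cap = 0;
  LGBM_GetMaxThreads(&cap);   // -1 means no LightGBM-specific cap is in effect
  std::printf("initial cap: %d\n", cap);

  LGBM_SetMaxThreads(6);      // OMP_NUM_THREADS() will now report at most 6
  LGBM_GetMaxThreads(&cap);
  std::printf("cap set to: %d\n", cap);

  LGBM_SetMaxThreads(-123);   // any value <= 0 resets the cap back to -1
  LGBM_GetMaxThreads(&cap);
  std::printf("after reset: %d\n", cap);
  return 0;
}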
diff --git a/tests/cpp_tests/test_arrow.cpp b/tests/cpp_tests/test_arrow.cpp new file mode 100644 index 000000000000..e975b6ba374b --- /dev/null +++ b/tests/cpp_tests/test_arrow.cpp @@ -0,0 +1,212 @@ +/*! + * Copyright (c) 2023 Microsoft Corporation. All rights reserved. + * Licensed under the MIT License. See LICENSE file in the project root for license information. + * + * Author: Oliver Borchert + */ + +#include +#include + +#include +#include + +using LightGBM::ArrowChunkedArray; +using LightGBM::ArrowTable; + +class ArrowChunkedArrayTest : public testing::Test { + protected: + void SetUp() override {} + + ArrowArray created_nested_array(const std::vector<ArrowArray*>& arrays) { + ArrowArray arr; + arr.buffers = nullptr; + arr.children = (ArrowArray**)arrays.data(); // NOLINT + arr.dictionary = nullptr; + arr.length = arrays[0]->length; + arr.n_buffers = 0; + arr.n_children = arrays.size(); + arr.null_count = 0; + arr.offset = 0; + arr.private_data = nullptr; + arr.release = nullptr; + return arr; + } + + template <typename T> + ArrowArray create_primitive_array(const std::vector<T>& values, + int64_t offset = 0, + std::vector<size_t> null_indices = {}) { + // NOTE: Arrow arrays have 64-bit alignment but we can safely ignore this in tests + // 1) Create validity bitmap + char* validity = nullptr; + if (!null_indices.empty()) { + auto num_bytes = (values.size() + 7) / 8; + validity = static_cast<char*>(calloc(num_bytes, sizeof(char))); + memset(validity, 0xff, num_bytes * sizeof(char)); + for (size_t i = 0; i < values.size(); ++i) { + if (std::find(null_indices.begin(), null_indices.end(), i) != null_indices.end()) { + validity[i / 8] &= ~(1 << (i % 8)); + } + } + } + + // 2) Create buffers + const void** buffers = (const void**)malloc(sizeof(void*) * 2); + buffers[0] = validity; + buffers[1] = values.data() + offset; + + // Create arrow array + ArrowArray arr; + arr.buffers = buffers; + arr.children = nullptr; + arr.dictionary = nullptr; + arr.length = values.size() - offset; + arr.null_count = 0; + arr.offset = 0; + arr.private_data = nullptr; + arr.release = [](ArrowArray* arr) { + if (arr->buffers[0] != nullptr) + free((void*)(arr->buffers[0])); // NOLINT + free((void*)(arr->buffers)); // NOLINT + }; + return arr; + } + + ArrowSchema create_nested_schema(const std::vector<ArrowSchema*>& arrays) { + ArrowSchema schema; + schema.format = "+s"; + schema.name = nullptr; + schema.metadata = nullptr; + schema.flags = 0; + schema.n_children = arrays.size(); + schema.children = (ArrowSchema**)arrays.data(); // NOLINT + schema.dictionary = nullptr; + schema.private_data = nullptr; + schema.release = nullptr; + return schema; + } + + template <typename T> + ArrowSchema create_primitive_schema() { + throw std::logic_error("not implemented"); + } + + template <> + ArrowSchema create_primitive_schema<float>() { + ArrowSchema schema; + schema.format = "f"; + schema.name = nullptr; + schema.metadata = nullptr; + schema.flags = 0; + schema.n_children = 0; + schema.children = nullptr; + schema.dictionary = nullptr; + schema.private_data = nullptr; + schema.release = nullptr; + return schema; + } +}; + +TEST_F(ArrowChunkedArrayTest, GetLength) { + std::vector<float> dat1 = {1, 2}; + auto arr1 = create_primitive_array(dat1); + + ArrowChunkedArray ca1(1, &arr1, nullptr); + ASSERT_EQ(ca1.get_length(), 2); + + std::vector<float> dat2 = {3, 4, 5, 6}; + auto arr2 = create_primitive_array(dat2); + ArrowArray arrs[2] = {arr1, arr2}; + ArrowChunkedArray ca2(2, arrs, nullptr); + ASSERT_EQ(ca2.get_length(), 6); + + arr1.release(&arr1); + arr2.release(&arr2); +} + +TEST_F(ArrowChunkedArrayTest, GetColumns) { + std::vector<float> dat1 = {1, 2, 3}; + auto arr1 = create_primitive_array(dat1); + std::vector<float> dat2 = {4, 5, 6}; + auto arr2 = create_primitive_array(dat2); + std::vector<ArrowArray*> arrs = {&arr1, &arr2}; + auto arr = created_nested_array(arrs); + + auto schema1 = create_primitive_schema<float>(); + auto schema2 = create_primitive_schema<float>(); + std::vector<ArrowSchema*> schemas = {&schema1, &schema2}; 
auto schema = create_nested_schema(schemas); + + ArrowTable table(1, &arr, &schema); + ASSERT_EQ(table.get_num_rows(), 3); + ASSERT_EQ(table.get_num_columns(), 2); + + auto ca1 = table.get_column(0); + ASSERT_EQ(ca1.get_length(), 3); + ASSERT_EQ(*ca1.begin(), 1); + + auto ca2 = table.get_column(1); + ASSERT_EQ(ca2.get_length(), 3); + ASSERT_EQ(*ca2.begin(), 4); + + arr1.release(&arr1); + arr2.release(&arr2); +} + +TEST_F(ArrowChunkedArrayTest, IteratorArithmetic) { + std::vector<float> dat1 = {1, 2}; + auto arr1 = create_primitive_array(dat1); + std::vector<float> dat2 = {3, 4, 5, 6}; + auto arr2 = create_primitive_array(dat2); + std::vector<float> dat3 = {7}; + auto arr3 = create_primitive_array(dat3); + auto schema = create_primitive_schema<float>(); + + ArrowArray arrs[3] = {arr1, arr2, arr3}; + ArrowChunkedArray ca(3, arrs, &schema); + + // Arithmetic + auto it = ca.begin(); + ASSERT_EQ(*it, 1); + ++it; + ASSERT_EQ(*it, 2); + ++it; + ASSERT_EQ(*it, 3); + it += 2; + ASSERT_EQ(*it, 5); + it += 2; + ASSERT_EQ(*it, 7); + --it; + ASSERT_EQ(*it, 6); + + // Subscripts + ASSERT_EQ(it[0], 1); + ASSERT_EQ(it[1], 2); + ASSERT_EQ(it[2], 3); + ASSERT_EQ(it[6], 7); + + // End + auto end = ca.end(); + ASSERT_EQ(end - it, 2); + ASSERT_EQ(end - ca.begin(), 7); + + arr1.release(&arr1); + arr2.release(&arr2); + arr3.release(&arr3); +} + +TEST_F(ArrowChunkedArrayTest, OffsetAndValidity) { + std::vector<float> dat = {0, 1, 2, 3, 4, 5, 6}; + auto arr = create_primitive_array(dat, 2, {0, 1}); + auto schema = create_primitive_schema<float>(); + ArrowChunkedArray ca(1, &arr, &schema); + + auto it = ca.begin(); + ASSERT_TRUE(std::isnan(*it)); + ASSERT_TRUE(std::isnan(*(++it))); + ASSERT_EQ(it[2], 4); + ASSERT_EQ(it[4], 6); + + arr.release(&arr); +} diff --git a/tests/cpp_tests/test_byte_buffer.cpp b/tests/cpp_tests/test_byte_buffer.cpp index 98df661ddd31..a6c2b660b983 100644 --- a/tests/cpp_tests/test_byte_buffer.cpp +++ b/tests/cpp_tests/test_byte_buffer.cpp @@ -30,7 +30,7 @@ TEST(ByteBuffer, JustWorks) { EXPECT_EQ(cumulativeSize, buffer->GetSize()); int16_t serializedInt16 = 0; char* int16Ptr = reinterpret_cast<char*>(&serializedInt16); - for (int i = 0; i < sizeof(int16_t); i++) { + for (unsigned int i = 0; i < sizeof(int16_t); i++) { int16Ptr[i] = buffer->GetAt(cumulativeSize - (sizeof(int16_t) - i)); } EXPECT_EQ(int16Val, serializedInt16); @@ -41,7 +41,7 @@ TEST(ByteBuffer, JustWorks) { EXPECT_EQ(cumulativeSize, buffer->GetSize()); int64_t serializedInt64 = 0; char* int64Ptr = reinterpret_cast<char*>(&serializedInt64); - for (int i = 0; i < sizeof(int64_t); i++) { + for (unsigned int i = 0; i < sizeof(int64_t); i++) { int64Ptr[i] = buffer->GetAt(cumulativeSize - (sizeof(int64_t) - i)); } EXPECT_EQ(int64Val, serializedInt64); @@ -52,7 +52,7 @@ TEST(ByteBuffer, JustWorks) { EXPECT_EQ(cumulativeSize, buffer->GetSize()); double serializedDouble = 0; char* doublePtr = reinterpret_cast<char*>(&serializedDouble); - for (int i = 0; i < sizeof(double); i++) { + for (unsigned int i = 0; i < sizeof(double); i++) { doublePtr[i] = buffer->GetAt(cumulativeSize - (sizeof(double) - i)); } EXPECT_EQ(doubleVal, serializedDouble); diff --git a/tests/cpp_tests/testutils.cpp b/tests/cpp_tests/testutils.cpp index f0b3e1c1f206..84acfe5b98a3 100644 --- a/tests/cpp_tests/testutils.cpp +++ b/tests/cpp_tests/testutils.cpp @@ -265,7 +265,7 @@ namespace LightGBM { groups_ptr, thread_count, t); - threads.push_back(move(th)); + threads.push_back(std::move(th)); } for (auto& t : threads) t.join(); diff --git a/tests/python_package_test/test_arrow.py b/tests/python_package_test/test_arrow.py new file 
mode 100644 index 000000000000..593c03d8c7ef --- /dev/null +++ b/tests/python_package_test/test_arrow.py @@ -0,0 +1,387 @@ +# coding: utf-8 +import filecmp +from typing import Any, Dict, Optional + +import numpy as np +import pyarrow as pa +import pytest + +import lightgbm as lgb + +from .utils import np_assert_array_equal + +# ----------------------------------------------------------------------------------------------- # +# UTILITIES # +# ----------------------------------------------------------------------------------------------- # + +_INTEGER_TYPES = [ + pa.int8(), + pa.int16(), + pa.int32(), + pa.int64(), + pa.uint8(), + pa.uint16(), + pa.uint32(), + pa.uint64(), +] +_FLOAT_TYPES = [ + pa.float32(), + pa.float64(), +] + + +def generate_simple_arrow_table(empty_chunks: bool = False) -> pa.Table: + c: list[list[int]] = [[]] if empty_chunks else [] + columns = [ + pa.chunked_array(c + [[1, 2, 3]] + c + [[4, 5]] + c, type=pa.uint8()), + pa.chunked_array(c + [[1, 2, 3]] + c + [[4, 5]] + c, type=pa.int8()), + pa.chunked_array(c + [[1, 2, 3]] + c + [[4, 5]] + c, type=pa.uint16()), + pa.chunked_array(c + [[1, 2, 3]] + c + [[4, 5]] + c, type=pa.int16()), + pa.chunked_array(c + [[1, 2, 3]] + c + [[4, 5]] + c, type=pa.uint32()), + pa.chunked_array(c + [[1, 2, 3]] + c + [[4, 5]] + c, type=pa.int32()), + pa.chunked_array(c + [[1, 2, 3]] + c + [[4, 5]] + c, type=pa.uint64()), + pa.chunked_array(c + [[1, 2, 3]] + c + [[4, 5]] + c, type=pa.int64()), + pa.chunked_array(c + [[1, 2, 3]] + c + [[4, 5]] + c, type=pa.float32()), + pa.chunked_array(c + [[1, 2, 3]] + c + [[4, 5]] + c, type=pa.float64()), + ] + return pa.Table.from_arrays(columns, names=[f"col_{i}" for i in range(len(columns))]) + + +def generate_nullable_arrow_table() -> pa.Table: + columns = [ + pa.chunked_array([[1, None, 3, 4, 5]], type=pa.float32()), + pa.chunked_array([[None, 2, 3, 4, 5]], type=pa.float32()), + pa.chunked_array([[1, 2, 3, 4, None]], type=pa.float32()), + pa.chunked_array([[None, None, None, None, None]], type=pa.float32()), + ] + return pa.Table.from_arrays(columns, names=[f"col_{i}" for i in range(len(columns))]) + + +def generate_dummy_arrow_table() -> pa.Table: + col1 = pa.chunked_array([[1, 2, 3], [4, 5]], type=pa.uint8()) + col2 = pa.chunked_array([[0.5, 0.6], [0.1, 0.8, 1.5]], type=pa.float32()) + return pa.Table.from_arrays([col1, col2], names=["a", "b"]) + + +def generate_random_arrow_table( + num_columns: int, + num_datapoints: int, + seed: int, + generate_nulls: bool = True, + values: Optional[np.ndarray] = None, +) -> pa.Table: + columns = [ + generate_random_arrow_array( + num_datapoints, seed + i, generate_nulls=generate_nulls, values=values + ) + for i in range(num_columns) + ] + names = [f"col_{i}" for i in range(num_columns)] + return pa.Table.from_arrays(columns, names=names) + + +def generate_random_arrow_array( + num_datapoints: int, + seed: int, + generate_nulls: bool = True, + values: Optional[np.ndarray] = None, +) -> pa.ChunkedArray: + generator = np.random.default_rng(seed) + data = ( + generator.standard_normal(num_datapoints) + if values is None + else generator.choice(values, size=num_datapoints, replace=True) + ) + + # Set random nulls + if generate_nulls: + indices = generator.choice(len(data), size=num_datapoints // 10) + data[indices] = None + + # Split data into <=2 random chunks + split_points = np.sort(generator.choice(np.arange(1, num_datapoints), 2, replace=False)) + split_points = np.concatenate([[0], split_points, [num_datapoints]]) + chunks = [data[split_points[i] : 
split_points[i + 1]] for i in range(len(split_points) - 1)] + chunks = [chunk for chunk in chunks if len(chunk) > 0] + + # Turn chunks into array + return pa.chunked_array(chunks, type=pa.float32()) + + +def dummy_dataset_params() -> Dict[str, Any]: + return { + "min_data_in_bin": 1, + "min_data_in_leaf": 1, + } + + +# ----------------------------------------------------------------------------------------------- # +# UNIT TESTS # +# ----------------------------------------------------------------------------------------------- # + +# ------------------------------------------- DATASET ------------------------------------------- # + + +@pytest.mark.parametrize( + ("arrow_table_fn", "dataset_params"), + [ # Use lambda functions here to minimize memory consumption + (lambda: generate_simple_arrow_table(), dummy_dataset_params()), + (lambda: generate_simple_arrow_table(empty_chunks=True), dummy_dataset_params()), + (lambda: generate_dummy_arrow_table(), dummy_dataset_params()), + (lambda: generate_nullable_arrow_table(), dummy_dataset_params()), + (lambda: generate_random_arrow_table(3, 1000, 42), {}), + (lambda: generate_random_arrow_table(100, 10000, 43), {}), + ], +) +def test_dataset_construct_fuzzy(tmp_path, arrow_table_fn, dataset_params): + arrow_table = arrow_table_fn() + + arrow_dataset = lgb.Dataset(arrow_table, params=dataset_params) + arrow_dataset.construct() + + pandas_dataset = lgb.Dataset(arrow_table.to_pandas(), params=dataset_params) + pandas_dataset.construct() + + arrow_dataset._dump_text(tmp_path / "arrow.txt") + pandas_dataset._dump_text(tmp_path / "pandas.txt") + assert filecmp.cmp(tmp_path / "arrow.txt", tmp_path / "pandas.txt") + + +# -------------------------------------------- FIELDS ------------------------------------------- # + + +def test_dataset_construct_fields_fuzzy(): + arrow_table = generate_random_arrow_table(3, 1000, 42) + arrow_labels = generate_random_arrow_array(1000, 42, generate_nulls=False) + arrow_weights = generate_random_arrow_array(1000, 42, generate_nulls=False) + arrow_groups = pa.chunked_array([[300, 400, 50], [250]], type=pa.int32()) + + arrow_dataset = lgb.Dataset( + arrow_table, label=arrow_labels, weight=arrow_weights, group=arrow_groups + ) + arrow_dataset.construct() + + pandas_dataset = lgb.Dataset( + arrow_table.to_pandas(), + label=arrow_labels.to_numpy(), + weight=arrow_weights.to_numpy(), + group=arrow_groups.to_numpy(), + ) + pandas_dataset.construct() + + # Check for equality + for field in ("label", "weight", "group"): + np_assert_array_equal( + arrow_dataset.get_field(field), pandas_dataset.get_field(field), strict=True + ) + np_assert_array_equal(arrow_dataset.get_label(), pandas_dataset.get_label(), strict=True) + np_assert_array_equal(arrow_dataset.get_weight(), pandas_dataset.get_weight(), strict=True) + + +# -------------------------------------------- LABELS ------------------------------------------- # + + +@pytest.mark.parametrize( + ["array_type", "label_data"], + [ + (pa.array, [0, 1, 0, 0, 1]), + (pa.chunked_array, [[0], [1, 0, 0, 1]]), + (pa.chunked_array, [[], [0], [1, 0, 0, 1]]), + (pa.chunked_array, [[0], [], [1, 0], [], [], [0, 1], []]), + ], +) +@pytest.mark.parametrize("arrow_type", _INTEGER_TYPES + _FLOAT_TYPES) +def test_dataset_construct_labels(array_type, label_data, arrow_type): + data = generate_dummy_arrow_table() + labels = array_type(label_data, type=arrow_type) + dataset = lgb.Dataset(data, label=labels, params=dummy_dataset_params()) + dataset.construct() + + expected = np.array([0, 1, 0, 0, 1], 
dtype=np.float32) + np_assert_array_equal(expected, dataset.get_label(), strict=True) + + +# ------------------------------------------- WEIGHTS ------------------------------------------- # + + +def test_dataset_construct_weights_none(): + data = generate_dummy_arrow_table() + weight = pa.array([1, 1, 1, 1, 1]) + dataset = lgb.Dataset(data, weight=weight, params=dummy_dataset_params()) + dataset.construct() + assert dataset.get_weight() is None + assert dataset.get_field("weight") is None + + +@pytest.mark.parametrize( + ["array_type", "weight_data"], + [ + (pa.array, [3, 0.7, 1.5, 0.5, 0.1]), + (pa.chunked_array, [[3], [0.7, 1.5, 0.5, 0.1]]), + (pa.chunked_array, [[], [3], [0.7, 1.5, 0.5, 0.1]]), + (pa.chunked_array, [[3], [0.7], [], [], [1.5, 0.5, 0.1], []]), + ], +) +@pytest.mark.parametrize("arrow_type", _FLOAT_TYPES) +def test_dataset_construct_weights(array_type, weight_data, arrow_type): + data = generate_dummy_arrow_table() + weights = array_type(weight_data, type=arrow_type) + dataset = lgb.Dataset(data, weight=weights, params=dummy_dataset_params()) + dataset.construct() + + expected = np.array([3, 0.7, 1.5, 0.5, 0.1], dtype=np.float32) + np_assert_array_equal(expected, dataset.get_weight(), strict=True) + + +# -------------------------------------------- GROUPS ------------------------------------------- # + + +@pytest.mark.parametrize( + ["array_type", "group_data"], + [ + (pa.array, [2, 3]), + (pa.chunked_array, [[2], [3]]), + (pa.chunked_array, [[], [2, 3]]), + (pa.chunked_array, [[2], [], [3], []]), + ], +) +@pytest.mark.parametrize("arrow_type", _INTEGER_TYPES) +def test_dataset_construct_groups(array_type, group_data, arrow_type): + data = generate_dummy_arrow_table() + groups = array_type(group_data, type=arrow_type) + dataset = lgb.Dataset(data, group=groups, params=dummy_dataset_params()) + dataset.construct() + + expected = np.array([0, 2, 5], dtype=np.int32) + np_assert_array_equal(expected, dataset.get_field("group"), strict=True) + + +# ----------------------------------------- INIT SCORES ----------------------------------------- # + + +@pytest.mark.parametrize( + ["array_type", "init_score_data"], + [ + (pa.array, [0, 1, 2, 3, 3]), + (pa.chunked_array, [[0, 1, 2], [3, 3]]), + (pa.chunked_array, [[], [0, 1, 2], [3, 3]]), + (pa.chunked_array, [[0, 1], [], [], [2], [3, 3], []]), + ], +) +@pytest.mark.parametrize("arrow_type", _INTEGER_TYPES + _FLOAT_TYPES) +def test_dataset_construct_init_scores_array( + array_type: Any, init_score_data: Any, arrow_type: Any +): + data = generate_dummy_arrow_table() + init_scores = array_type(init_score_data, type=arrow_type) + dataset = lgb.Dataset(data, init_score=init_scores, params=dummy_dataset_params()) + dataset.construct() + + expected = np.array([0, 1, 2, 3, 3], dtype=np.float64) + np_assert_array_equal(expected, dataset.get_init_score(), strict=True) + + +def test_dataset_construct_init_scores_table(): + data = generate_dummy_arrow_table() + init_scores = pa.Table.from_arrays( + [ + generate_random_arrow_array(5, seed=1, generate_nulls=False), + generate_random_arrow_array(5, seed=2, generate_nulls=False), + generate_random_arrow_array(5, seed=3, generate_nulls=False), + ], + names=["a", "b", "c"], + ) + dataset = lgb.Dataset(data, init_score=init_scores, params=dummy_dataset_params()) + dataset.construct() + + actual = dataset.get_init_score() + expected = init_scores.to_pandas().to_numpy().astype(np.float64) + np_assert_array_equal(expected, actual, strict=True) + + +# ------------------------------------------ 
PREDICTION ----------------------------------------- # + + +def assert_equal_predict_arrow_pandas(booster: lgb.Booster, data: pa.Table): + p_arrow = booster.predict(data) + p_pandas = booster.predict(data.to_pandas()) + np_assert_array_equal(p_arrow, p_pandas, strict=True) + + p_raw_arrow = booster.predict(data, raw_score=True) + p_raw_pandas = booster.predict(data.to_pandas(), raw_score=True) + np_assert_array_equal(p_raw_arrow, p_raw_pandas, strict=True) + + p_leaf_arrow = booster.predict(data, pred_leaf=True) + p_leaf_pandas = booster.predict(data.to_pandas(), pred_leaf=True) + np_assert_array_equal(p_leaf_arrow, p_leaf_pandas, strict=True) + + p_pred_contrib_arrow = booster.predict(data, pred_contrib=True) + p_pred_contrib_pandas = booster.predict(data.to_pandas(), pred_contrib=True) + np_assert_array_equal(p_pred_contrib_arrow, p_pred_contrib_pandas, strict=True) + + p_first_iter_arrow = booster.predict(data, start_iteration=0, num_iteration=1, raw_score=True) + p_first_iter_pandas = booster.predict( + data.to_pandas(), start_iteration=0, num_iteration=1, raw_score=True + ) + np_assert_array_equal(p_first_iter_arrow, p_first_iter_pandas, strict=True) + + +def test_predict_regression(): + data = generate_random_arrow_table(10, 10000, 42) + dataset = lgb.Dataset( + data, + label=generate_random_arrow_array(10000, 43, generate_nulls=False), + params=dummy_dataset_params(), + ) + booster = lgb.train( + {"objective": "regression", "num_leaves": 7}, + dataset, + num_boost_round=5, + ) + assert_equal_predict_arrow_pandas(booster, data) + + +def test_predict_binary_classification(): + data = generate_random_arrow_table(10, 10000, 42) + dataset = lgb.Dataset( + data, + label=generate_random_arrow_array(10000, 43, generate_nulls=False, values=np.arange(2)), + params=dummy_dataset_params(), + ) + booster = lgb.train( + {"objective": "binary", "num_leaves": 7}, + dataset, + num_boost_round=5, + ) + assert_equal_predict_arrow_pandas(booster, data) + + +def test_predict_multiclass_classification(): + data = generate_random_arrow_table(10, 10000, 42) + dataset = lgb.Dataset( + data, + label=generate_random_arrow_array(10000, 43, generate_nulls=False, values=np.arange(5)), + params=dummy_dataset_params(), + ) + booster = lgb.train( + {"objective": "multiclass", "num_leaves": 7, "num_class": 5}, + dataset, + num_boost_round=5, + ) + assert_equal_predict_arrow_pandas(booster, data) + + +def test_predict_ranking(): + data = generate_random_arrow_table(10, 10000, 42) + dataset = lgb.Dataset( + data, + label=generate_random_arrow_array(10000, 43, generate_nulls=False, values=np.arange(4)), + group=np.array([1000, 2000, 3000, 4000]), + params=dummy_dataset_params(), + ) + booster = lgb.train( + {"objective": "lambdarank", "num_leaves": 7}, + dataset, + num_boost_round=5, + ) + assert_equal_predict_arrow_pandas(booster, data) diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py index 7f8980c271f7..b8ef43e41397 100644 --- a/tests/python_package_test/test_basic.py +++ b/tests/python_package_test/test_basic.py @@ -15,7 +15,7 @@ import lightgbm as lgb from lightgbm.compat import PANDAS_INSTALLED, pd_DataFrame, pd_Series -from .utils import dummy_obj, load_breast_cancer, mse_obj +from .utils import dummy_obj, load_breast_cancer, mse_obj, np_assert_array_equal def test_basic(tmp_path): @@ -499,6 +499,94 @@ def check_asserts(data): check_asserts(lgb_data) +def test_dataset_construction_overwrites_user_provided_metadata_fields(): + + X = np.array([[1.0, 2.0], [3.0, 4.0]]) + 
+ position = np.array([0.0, 1.0], dtype=np.float32) + if getenv('TASK', '') == 'cuda': + position = None + + dtrain = lgb.Dataset( + X, + params={ + "min_data_in_bin": 1, + "min_data_in_leaf": 1, + "verbosity": -1 + }, + group=[1, 1], + init_score=[0.312, 0.708], + label=[1, 2], + position=position, + weight=[0.5, 1.5], + ) + + # unconstructed, get_* methods should return whatever was provided + assert dtrain.group == [1, 1] + assert dtrain.get_group() == [1, 1] + assert dtrain.init_score == [0.312, 0.708] + assert dtrain.get_init_score() == [0.312, 0.708] + assert dtrain.label == [1, 2] + assert dtrain.get_label() == [1, 2] + if getenv('TASK', '') != 'cuda': + np_assert_array_equal( + dtrain.position, + np.array([0.0, 1.0], dtype=np.float32), + strict=True + ) + np_assert_array_equal( + dtrain.get_position(), + np.array([0.0, 1.0], dtype=np.float32), + strict=True + ) + assert dtrain.weight == [0.5, 1.5] + assert dtrain.get_weight() == [0.5, 1.5] + + # before construction, get_field() should raise an exception + for field_name in ["group", "init_score", "label", "position", "weight"]: + with pytest.raises(Exception, match=f"Cannot get {field_name} before construct Dataset"): + dtrain.get_field(field_name) + + # constructed, get_* methods should return numpy arrays, even when the provided + # input was a list of floats or ints + dtrain.construct() + expected_group = np.array([1, 1], dtype=np.int32) + np_assert_array_equal(dtrain.group, expected_group, strict=True) + np_assert_array_equal(dtrain.get_group(), expected_group, strict=True) + # get_field("group") returns a numpy array with boundaries, instead of size + np_assert_array_equal( + dtrain.get_field("group"), + np.array([0, 1, 2], dtype=np.int32), + strict=True + ) + + expected_init_score = np.array([0.312, 0.708],) + np_assert_array_equal(dtrain.init_score, expected_init_score, strict=True) + np_assert_array_equal(dtrain.get_init_score(), expected_init_score, strict=True) + np_assert_array_equal(dtrain.get_field("init_score"), expected_init_score, strict=True) + + expected_label = np.array([1, 2], dtype=np.float32) + np_assert_array_equal(dtrain.label, expected_label, strict=True) + np_assert_array_equal(dtrain.get_label(), expected_label, strict=True) + np_assert_array_equal(dtrain.get_field("label"), expected_label, strict=True) + + if getenv('TASK', '') != 'cuda': + expected_position = np.array([0.0, 1.0], dtype=np.float32) + np_assert_array_equal(dtrain.position, expected_position, strict=True) + np_assert_array_equal(dtrain.get_position(), expected_position, strict=True) + # NOTE: "position" is converted to int32 on the C++ side + np_assert_array_equal( + dtrain.get_field("position"), + np.array([0.0, 1.0], dtype=np.int32), + strict=True + ) + + expected_weight = np.array([0.5, 1.5], dtype=np.float32) + np_assert_array_equal(dtrain.weight, expected_weight, strict=True) + np_assert_array_equal(dtrain.get_weight(), expected_weight, strict=True) + np_assert_array_equal(dtrain.get_field("weight"), expected_weight, strict=True) + + def test_choose_param_value(): original_params = { @@ -734,21 +822,34 @@ def test_no_copy_when_single_float_dtype_dataframe(dtype, feature_name): @pytest.mark.parametrize('feature_name', [['x1'], [42], 'auto']) -def test_categorical_code_conversion_doesnt_modify_original_data(feature_name): +@pytest.mark.parametrize('categories', ['seen', 'unseen']) +def test_categorical_code_conversion_doesnt_modify_original_data(feature_name, categories): pd = pytest.importorskip('pandas') X = np.random.choice(['a', 
'b'], 100).reshape(-1, 1) column_name = 'a' if feature_name == 'auto' else feature_name[0] df = pd.DataFrame(X.copy(), columns=[column_name], dtype='category') + if categories == 'seen': + pandas_categorical = [['a', 'b']] + else: + pandas_categorical = [['a']] data = lgb.basic._data_from_pandas( data=df, feature_name=feature_name, categorical_feature="auto", - pandas_categorical=None + pandas_categorical=pandas_categorical, )[0] # check that the original data wasn't modified np.testing.assert_equal(df[column_name], X[:, 0]) # check that the built data has the codes - np.testing.assert_equal(df[column_name].cat.codes, data[:, 0]) + if categories == 'seen': + # if all categories were seen during training we just take the codes + codes = df[column_name].cat.codes + else: + # if we only saw 'a' during training we just replace its code + # and leave the rest as nan + a_code = df[column_name].cat.categories.get_loc('a') + codes = np.where(df[column_name] == 'a', a_code, np.nan) + np.testing.assert_equal(codes, data[:, 0]) @pytest.mark.parametrize('min_data_in_bin', [2, 10]) diff --git a/tests/python_package_test/test_callback.py b/tests/python_package_test/test_callback.py index cb5dc707bf43..f93ca837f8b9 100644 --- a/tests/python_package_test/test_callback.py +++ b/tests/python_package_test/test_callback.py @@ -21,6 +21,17 @@ def test_early_stopping_callback_is_picklable(serializer): assert callback.stopping_rounds == rounds +def test_early_stopping_callback_rejects_invalid_stopping_rounds_with_informative_errors(): + with pytest.raises(ValueError, match="stopping_rounds should be an integer and greater than 0. got: 0"): + lgb.early_stopping(stopping_rounds=0) + + with pytest.raises(ValueError, match="stopping_rounds should be an integer and greater than 0. got: -1"): + lgb.early_stopping(stopping_rounds=-1) + + with pytest.raises(ValueError, match="stopping_rounds should be an integer and greater than 0. got: neverrrr"): + lgb.early_stopping(stopping_rounds="neverrrr") + + @pytest.mark.parametrize('serializer', SERIALIZERS) def test_log_evaluation_callback_is_picklable(serializer): periods = 42 diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py index b46526bcfaf6..e355e5ab074a 100644 --- a/tests/python_package_test/test_engine.py +++ b/tests/python_package_test/test_engine.py @@ -755,7 +755,7 @@ def test_ranking_prediction_early_stopping(): # (in our example it is simply the ordering by some feature correlated with relevance, e.g., 34) # and clicks on that document (new_label=1) with some probability 'pclick' depending on its true relevance; # at each position the user may stop the traversal with some probability pstop. For the non-clicked documents, -# new_label=0. Thus the generated new labels are biased towards the baseline ranker. +# new_label=0. Thus the generated new labels are biased towards the baseline ranker. # The positions of the documents in the ranked lists produced by the baseline, are returned. 
 def simulate_position_bias(file_dataset_in, file_query_in, file_dataset_out, baseline_feature):
     # a mapping of a document's true relevance (defined on a 5-grade scale) into the probability of clicking it
@@ -772,7 +772,7 @@ def get_pclick(label):
         return 0.9
     # an instantiation of a cascade model where the user stops with probability 0.2 after observing each document
     pstop = 0.2
-    
+
     f_dataset_in = open(file_dataset_in, 'r')
     f_dataset_out = open(file_dataset_out, 'w')
     random.seed(10)
@@ -780,19 +780,19 @@ def get_pclick(label):
     for line in open(file_query_in):
         docs_num = int (line)
         lines = []
-        index_values = [] 
+        index_values = []
         positions = [0] * docs_num
         for index in range(docs_num):
            features = f_dataset_in.readline().split()
            lines.append(features)
            val = 0.0
            for feature_val in features:
-                feature_val_split = feature_val.split(":") 
+                feature_val_split = feature_val.split(":")
                if int(feature_val_split[0]) == baseline_feature:
                    val = float(feature_val_split[1])
            index_values.append([index, val])
         index_values.sort(key=lambda x: -x[1])
-        stop = False 
+        stop = False
         for pos in range(docs_num):
            index = index_values[pos][0]
            new_label = 0
@@ -800,7 +800,7 @@ def get_pclick(label):
                label = int(lines[index][0])
                pclick = get_pclick(label)
                if random.random() < pclick:
-                    new_label = 1 
+                    new_label = 1
                stop = random.random() < pstop
            lines[index][0] = str(new_label)
            positions[index] = pos
@@ -843,7 +843,7 @@ def test_ranking_with_position_information_with_file(tmp_path):
     lgb_train = lgb.Dataset(str(tmp_path / 'rank.train'), params=params)
     lgb_valid = [lgb_train.create_valid(str(tmp_path / 'rank.test'))]
     gbm_unbiased_with_file = lgb.train(params, lgb_train, valid_sets = lgb_valid, num_boost_round=50)
-    
+
     # the performance of the unbiased LambdaMART should outperform the plain LambdaMART on the dataset with position bias
     assert gbm_baseline.best_score['valid_0']['ndcg@3'] + 0.03 <= gbm_unbiased_with_file.best_score['valid_0']['ndcg@3']
@@ -853,7 +853,7 @@ def test_ranking_with_position_information_with_file(tmp_path):
     file.close()
     lgb_train = lgb.Dataset(str(tmp_path / 'rank.train'), params=params)
     lgb_valid = [lgb_train.create_valid(str(tmp_path / 'rank.test'))]
-    with pytest.raises(lgb.basic.LightGBMError, match="Positions size \(3006\) doesn't match data size"):
+    with pytest.raises(lgb.basic.LightGBMError, match=r"Positions size \(3006\) doesn't match data size"):
         lgb.train(params, lgb_train, valid_sets = lgb_valid, num_boost_round=50)
@@ -1470,7 +1470,7 @@ def test_feature_name_with_non_ascii():
     assert feature_names == gbm2.feature_name()


-def test_parameters_are_loaded_from_model_file(tmp_path):
+def test_parameters_are_loaded_from_model_file(tmp_path, capsys):
     X = np.hstack([np.random.rand(100, 1), np.random.randint(0, 5, (100, 2))])
     y = np.random.rand(100)
     ds = lgb.Dataset(X, y)
@@ -1487,8 +1487,18 @@ def test_parameters_are_loaded_from_model_file(tmp_path):
         'num_threads': 1,
     }
     model_file = tmp_path / 'model.txt'
-    lgb.train(params, ds, num_boost_round=1, categorical_feature=[1, 2]).save_model(model_file)
+    orig_bst = lgb.train(params, ds, num_boost_round=1, categorical_feature=[1, 2])
+    orig_bst.save_model(model_file)
+    with model_file.open('rt') as f:
+        model_contents = f.readlines()
+    params_start = model_contents.index('parameters:\n')
+    model_contents.insert(params_start + 1, '[max_conflict_rate: 0]\n')
+    with model_file.open('wt') as f:
+        f.writelines(model_contents)
     bst = lgb.Booster(model_file=model_file)
+    expected_msg = "[LightGBM] [Warning] Ignoring unrecognized parameter 'max_conflict_rate' found in model string."
+    stdout = capsys.readouterr().out
+    assert expected_msg in stdout
     set_params = {k: bst.params[k] for k in params.keys()}
     assert set_params == params
     assert bst.params['categorical_feature'] == [1, 2]
@@ -1498,6 +1508,11 @@ def test_parameters_are_loaded_from_model_file(tmp_path, capsys):
     bst2 = lgb.Booster(params={'num_leaves': 7}, model_file=model_file)
     assert bst.params == bst2.params

+    # check inference isn't affected by unknown parameter
+    orig_preds = orig_bst.predict(X)
+    preds = bst.predict(X)
+    np.testing.assert_allclose(preds, orig_preds)
+

 def test_save_load_copy_pickle():
     def train_and_predict(init_model=None, return_model=False):
@@ -4501,9 +4516,9 @@ def test_train_raises_informative_error_if_any_valid_sets_are_not_dataset_object


 def test_train_raises_informative_error_for_params_of_wrong_type():
     X, y = make_synthetic_regression()
-    params = {"early_stopping_round": "too-many"}
+    params = {"num_leaves": "too-many"}
     dtrain = lgb.Dataset(X, label=y)
-    with pytest.raises(lgb.basic.LightGBMError, match="Parameter early_stopping_round should be of type int, got \"too-many\""):
+    with pytest.raises(lgb.basic.LightGBMError, match="Parameter num_leaves should be of type int, got \"too-many\""):
         lgb.train(params, dtrain)
diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py
index 9c6036ad510c..f54fe5236d79 100644
--- a/tests/python_package_test/test_sklearn.py
+++ b/tests/python_package_test/test_sklearn.py
@@ -640,11 +640,12 @@ def test_non_serializable_objects_in_callbacks(tmp_path):
     assert gbm.booster_.attr_set_inside_callback == 40


-def test_random_state_object():
+@pytest.mark.parametrize("rng_constructor", [np.random.RandomState, np.random.default_rng])
+def test_random_state_object(rng_constructor):
     X, y = load_iris(return_X_y=True)
     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
-    state1 = np.random.RandomState(123)
-    state2 = np.random.RandomState(123)
+    state1 = rng_constructor(123)
+    state2 = rng_constructor(123)
     clf1 = lgb.LGBMClassifier(n_estimators=10, subsample=0.5, subsample_freq=1, random_state=state1)
     clf2 = lgb.LGBMClassifier(n_estimators=10, subsample=0.5, subsample_freq=1, random_state=state2)
     # Test if random_state is properly stored
diff --git a/tests/python_package_test/utils.py b/tests/python_package_test/utils.py
index df01e29852e7..7eae62b14369 100644
--- a/tests/python_package_test/utils.py
+++ b/tests/python_package_test/utils.py
@@ -1,6 +1,7 @@
 # coding: utf-8
 import pickle
 from functools import lru_cache
+from inspect import getfullargspec

 import cloudpickle
 import joblib
@@ -193,3 +194,22 @@ def pickle_and_unpickle_object(obj, serializer):
         serializer=serializer
     )
     return obj_from_disk  # noqa: RET504
+
+
+# doing this here, at import time, to ensure it only runs once per import
+# instead of once per assertion
+_numpy_testing_supports_strict_kwarg = (
+    "strict" in getfullargspec(np.testing.assert_array_equal).kwonlyargs
+)
+
+
+def np_assert_array_equal(*args, **kwargs):
+    """
+    np.testing.assert_array_equal() only got the kwarg ``strict`` in June 2022:
+    https://github.com/numpy/numpy/pull/21595
+
+    This function is here to support testing with older Python (and therefore older ``numpy``) versions.
+    """
+    if not _numpy_testing_supports_strict_kwarg:
+        kwargs.pop("strict")
+    np.testing.assert_array_equal(*args, **kwargs)
diff --git a/windows/LightGBM.vcxproj b/windows/LightGBM.vcxproj
index 96fe017e96b8..b7848b4cab5a 100644
--- a/windows/LightGBM.vcxproj
+++ b/windows/LightGBM.vcxproj
@@ -343,6 +343,7 @@
+
diff --git a/windows/LightGBM.vcxproj.filters b/windows/LightGBM.vcxproj.filters
index 27b445893c0f..7010926799b7 100644
--- a/windows/LightGBM.vcxproj.filters
+++ b/windows/LightGBM.vcxproj.filters
@@ -344,5 +344,8 @@
     src\treelearner
+
+    src\utils
+
\ No newline at end of file
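
A note on the np_assert_array_equal shim added in tests/python_package_test/utils.py above: instead of comparing numpy version strings, it inspects the signature of np.testing.assert_array_equal() once at import time and drops the keyword-only `strict` argument (introduced in June 2022 via numpy/numpy#21595) whenever the installed numpy predates it. The following standalone sketch illustrates the same feature-detection pattern; the name assert_array_equal_compat and the `__main__` usage are illustrative only and are not part of the diff.

# Standalone sketch: feature-detect an optional keyword-only argument
# instead of parsing library version strings.
from inspect import getfullargspec

import numpy as np

# keyword-only parameters appear in .kwonlyargs, not .args;
# computed once at import time rather than on every assertion
_supports_strict = (
    "strict" in getfullargspec(np.testing.assert_array_equal).kwonlyargs
)


def assert_array_equal_compat(*args, **kwargs):
    """Forward to np.testing.assert_array_equal, dropping ``strict`` on older numpy."""
    if not _supports_strict:
        kwargs.pop("strict", None)
    np.testing.assert_array_equal(*args, **kwargs)


if __name__ == "__main__":
    # where supported, strict=True also requires exactly matching dtypes and shapes
    assert_array_equal_compat(
        np.array([1, 2], dtype=np.int32),
        np.array([1, 2], dtype=np.int32),
        strict=True,
    )
    print("ok")

Detecting the capability directly keeps every call site identical across numpy versions and avoids brittle numpy.__version__ comparisons.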