diff --git a/include/LightGBM/cuda/cuda_tree.hpp b/include/LightGBM/cuda/cuda_tree.hpp
index f3b92f52b385..5a25f4942fe0 100644
--- a/include/LightGBM/cuda/cuda_tree.hpp
+++ b/include/LightGBM/cuda/cuda_tree.hpp
@@ -99,7 +99,7 @@ class CUDATree : public Tree {
 
   double* cuda_leaf_value_ref() const { return cuda_leaf_value_; }
 
-  int host_leaf_depth(int leaf_index) {
+  int host_leaf_depth(int leaf_index) {
     if (leaf_index >= 0 && leaf_index < num_leaves_) {
       return host_leaf_depth_[leaf_index];
     } else {
diff --git a/src/treelearner/cuda/cuda_best_split_finder.cpp b/src/treelearner/cuda/cuda_best_split_finder.cpp
index 52ad9d8dca69..1546a2035356 100644
--- a/src/treelearner/cuda/cuda_best_split_finder.cpp
+++ b/src/treelearner/cuda/cuda_best_split_finder.cpp
@@ -331,7 +331,7 @@ void CUDABestSplitFinder::FindBestSplitsForLeaf(
     const data_size_t num_data_in_larger_leaf,
     const double sum_hessians_in_smaller_leaf,
     const double sum_hessians_in_larger_leaf,
-    const int small_leaf_depth,
+    const int smaller_leaf_depth,
     const int larger_leaf_depth,
     const score_t* grad_scale,
     const score_t* hess_scale,
@@ -339,10 +339,10 @@ void CUDABestSplitFinder::FindBestSplitsForLeaf(
     const uint8_t larger_num_bits_in_histogram_bins) {
   const bool is_smaller_leaf_valid = (num_data_in_smaller_leaf > min_data_in_leaf_ && sum_hessians_in_smaller_leaf > min_sum_hessian_in_leaf_ &&
-    (max_depth > 0 && smaller_leaf_depth > 0 && smaller_leaf_depth < max_depth));
+    ((max_depth_ > 0 && smaller_leaf_depth > 0 && smaller_leaf_depth <= max_depth_) || (max_depth_ <= 0)));
   const bool is_larger_leaf_valid = (num_data_in_larger_leaf > min_data_in_leaf_ && sum_hessians_in_larger_leaf > min_sum_hessian_in_leaf_ && larger_leaf_index >= 0 &&
-    (max_depth > 0 && larger_leaf_depth > 0 && larger_leaf_depth < max_depth));
+    ((max_depth_ > 0 && larger_leaf_depth > 0 && larger_leaf_depth <= max_depth_) || (max_depth_ <= 0)));
   if (grad_scale != nullptr && hess_scale != nullptr) {
     LaunchFindBestSplitsDiscretizedForLeafKernel(smaller_leaf_splits, larger_leaf_splits, smaller_leaf_index, larger_leaf_index, is_smaller_leaf_valid, is_larger_leaf_valid,
diff --git a/src/treelearner/cuda/cuda_best_split_finder.hpp b/src/treelearner/cuda/cuda_best_split_finder.hpp
index 09a70ddff05d..e4d29a00a47b 100644
--- a/src/treelearner/cuda/cuda_best_split_finder.hpp
+++ b/src/treelearner/cuda/cuda_best_split_finder.hpp
@@ -68,6 +68,8 @@ class CUDABestSplitFinder {
     const data_size_t num_data_in_larger_leaf,
     const double sum_hessians_in_smaller_leaf,
     const double sum_hessians_in_larger_leaf,
+    const int smaller_leaf_depth,
+    const int larger_leaf_depth,
     const score_t* grad_scale,
     const score_t* hess_scale,
     const uint8_t smaller_num_bits_in_histogram_bins,
diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py
index eadc1496e99a..9cc2c9e418b8 100644
--- a/tests/python_package_test/test_basic.py
+++ b/tests/python_package_test/test_basic.py
@@ -913,6 +913,41 @@ def test_feature_names_are_set_correctly_when_no_feature_names_passed_into_Datas
     assert ds.construct().feature_name == ["Column_0", "Column_1", "Column_2"]
 
 
+def test_max_depth_is_enforced(capsys):
+    params = {
+        "objective": "binary",
+        "min_data": 10,
+        "num_leaves": 15,
+        "verbose": -1,
+        "num_threads": 1,
+        "max_bin": 255,
+        "gpu_use_dp": True,
+        "deterministic": True,
+        "random_state": 2,
+    }
+    X, y = make_blobs(n_samples=1_000, n_features=1, centers=2, random_state=2)
+    model = lgb.LGBMRegressor(**params)
+    model.fit(X, y)
+    fitted_max_depth = (
+        model.booster_.trees_to_dataframe().groupby("tree_index")["node_depth"].max().value_counts().index.max()
+    )
+    assert fitted_max_depth == 9, (
+        "This data generation and model fitting procedure should be deterministic within backends. "
+        "Both cpu and cuda should result in models with maximal tree depth 9."
+    )
+    # set a constraining value of max_depth, i.e. lower than 9
+    constrained_model = lgb.LGBMRegressor(max_depth=6, **params)
+    constrained_model.fit(X, y)
+    assert (
+        constrained_model.booster_.trees_to_dataframe()
+        .groupby("tree_index")["node_depth"]
+        .max()
+        .value_counts()
+        .index.max()
+        <= 7
+    ), "Trained model contains splits deeper than max_depth = 6"
+
+
 # NOTE: this intentionally contains values where num_leaves <, ==, and > (max_depth^2)
 @pytest.mark.parametrize(("max_depth", "num_leaves"), [(-1, 3), (-1, 50), (5, 3), (5, 31), (5, 32), (8, 3), (8, 31)])
 def test_max_depth_warning_is_not_raised_if_num_leaves_is_also_provided(capsys, num_leaves, max_depth):