Testcases: Lower the L2/L3 cache size for GPU targets

antonysigma · antonysigma · commit de0a19571549 · 2024-09-05T17:58:07.000-07:00
diff --git a/apps/bilateral_grid/CMakeLists.txt b/apps/bilateral_grid/CMakeLists.txt
@@ -25,7 +25,11 @@ add_halide_library(bilateral_grid_auto_schedule FROM bilateral_grid.generator
                    GENERATOR bilateral_grid
                    STMT bilateral_grid_auto_schedule_STMT
                    SCHEDULE bilateral_grid_auto_schedule_SCHEDULE
-                   AUTOSCHEDULER Halide::Mullapudi2016)
+                   AUTOSCHEDULER Halide::Mullapudi2016
+                   # When target=host-cuda or host-metal, limit the GPU shared
+                   # memory per block (in bytes) to avoid GPU kernel launch failures.
+                   PARAMS autoscheduler.last_level_cache_size=20000
+                   )
 
 # Main executable
 add_executable(bilateral_grid_process filter.cpp)
diff --git a/apps/local_laplacian/CMakeLists.txt b/apps/local_laplacian/CMakeLists.txt
@@ -20,7 +20,11 @@ add_halide_generator(local_laplacian.generator
 add_halide_library(local_laplacian FROM local_laplacian.generator)
 add_halide_library(local_laplacian_auto_schedule FROM local_laplacian.generator
                    GENERATOR local_laplacian
-                   AUTOSCHEDULER Halide::Mullapudi2016)
+                   AUTOSCHEDULER Halide::Mullapudi2016
+                   # When target=host-cuda or host-metal, limit the GPU shared
+                   # memory per block (in bytes) to avoid GPU kernel launch failures.
+                   PARAMS autoscheduler.last_level_cache_size=30000
+                   )
 
 # Main executable
 add_executable(local_laplacian_process process.cpp)
diff --git a/apps/stencil_chain/CMakeLists.txt b/apps/stencil_chain/CMakeLists.txt
@@ -18,7 +18,11 @@ add_halide_generator(stencil_chain.generator SOURCES stencil_chain_generator.cpp
 add_halide_library(stencil_chain FROM stencil_chain.generator)
 add_halide_library(stencil_chain_auto_schedule FROM stencil_chain.generator
                    GENERATOR stencil_chain
-                   AUTOSCHEDULER Halide::Mullapudi2016)
+                   AUTOSCHEDULER Halide::Mullapudi2016
+                   # When target=host-cuda or host-metal, limit the GPU shared
+                   # memory per block (in bytes) to avoid GPU kernel launch failures.
+                   PARAMS autoscheduler.last_level_cache_size=15000
+                   )
 
 # Main executable
 add_executable(stencil_chain_process process.cpp)
diff --git a/src/autoschedulers/mullapudi2016/AutoSchedule.cpp b/src/autoschedulers/mullapudi2016/AutoSchedule.cpp
@@ -42,7 +42,7 @@ struct ArchParams {
      * CACHE_SIZE to 48 KB.
      */
     constexpr ArchParams(bool has_gpu_feature)
-        : parallelism(has_gpu_feature ? 128 : 16), last_level_cache_size(has_gpu_feature ? 48 * 1024 : 16 * 1024 * 1024),
+        : parallelism(has_gpu_feature ? 128 : 16), last_level_cache_size(has_gpu_feature ? 35 * 1024 : 16 * 1024 * 1024),
           balance(has_gpu_feature ? 20 : 40) {
     }
 };

Original file line number	Diff line number	Diff line change
`@@ -42,7 +42,7 @@ struct ArchParams {`
`42`	`42`	`* CACHE_SIZE to 48 KB.`
`43`	`43`	`*/`
`44`	`44`	`constexpr ArchParams(bool has_gpu_feature)`
`45`		`- : parallelism(has_gpu_feature ? 128 : 16), last_level_cache_size(has_gpu_feature ? 48 * 1024 : 16 * 1024 * 1024),`
	`45`	`+ : parallelism(has_gpu_feature ? 128 : 16), last_level_cache_size(has_gpu_feature ? 35 * 1024 : 16 * 1024 * 1024),`
`46`	`46`	`balance(has_gpu_feature ? 20 : 40) {`
`47`	`47`	`}`
`48`	`48`	`};`