Skip to content

Commit e476238

Browse files
chhwangBinyang2014
andauthored
Update CK GeMM (#200)
* Upgrade CK to tag `rocm-6.0.0` * Fix potential throughput bugs * More robust correctness tests --------- Co-authored-by: Binyang Li <[email protected]>
1 parent f6531f0 commit e476238

19 files changed

+1774
-1335
lines changed

.azure-pipelines/ut.yml

+2-1
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ jobs:
4444
script: |
4545
git submodule foreach --recursive git reset --hard
4646
git submodule foreach --recursive git clean -fdx
47+
git submodule foreach git fetch
4748
git submodule update --init --recursive
4849
4950
- task: Bash@3
@@ -63,7 +64,7 @@ jobs:
6364
inputs:
6465
targetType: 'inline'
6566
script: |
66-
cd build && ARK_ROOT=$PWD ctest --stop-on-failure --verbose --schedule-random
67+
cd build && ARK_ROOT=$PWD ARK_IGNORE_BINARY_CACHE=1 ctest --stop-on-failure --verbose --schedule-random
6768
workingDirectory: '$(System.DefaultWorkingDirectory)'
6869

6970
- task: Bash@3

.github/workflows/ut-cuda.yml

+2-1
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ jobs:
4141
git config --global --add safe.directory /__w/ark/ark
4242
git submodule foreach --recursive git reset --hard
4343
git submodule foreach --recursive git clean -fdx
44+
git submodule foreach git fetch
4445
git submodule update --init --recursive
4546
4647
- name: Build
@@ -51,7 +52,7 @@ jobs:
5152
5253
- name: RunUT
5354
run: |
54-
cd build && ARK_ROOT=$PWD ctest --stop-on-failure --verbose --schedule-random
55+
cd build && ARK_ROOT=$PWD ARK_IGNORE_BINARY_CACHE=1 ctest --stop-on-failure --verbose --schedule-random
5556
5657
- name: ReportCoverage
5758
run: |

.github/workflows/ut-rocm.yml

+2-1
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ jobs:
3333
git config --global --add safe.directory /__w/ark/ark
3434
git submodule foreach --recursive git reset --hard
3535
git submodule foreach --recursive git clean -fdx
36+
git submodule foreach git fetch
3637
git submodule update --init --recursive
3738
3839
- name: Build
@@ -43,7 +44,7 @@ jobs:
4344
4445
- name: RunUT
4546
run: |
46-
cd build && ARK_ROOT=$PWD ctest --stop-on-failure --verbose --schedule-random -E "ops_matmul_test"
47+
cd build && ARK_ROOT=$PWD ARK_IGNORE_BINARY_CACHE=1 ctest --stop-on-failure --verbose --schedule-random
4748
4849
- name: ReportCoverage
4950
run: |

ark/bfloat16_test.cc

+18
Original file line numberDiff line numberDiff line change
@@ -228,8 +228,26 @@ ark::unittest::State test_bfloat16() {
228228
return ark::unittest::SUCCESS;
229229
}
230230

231+
ark::unittest::State test_bfloat16_error() {
232+
ark::bfloat16_t x(0.1f);
233+
ark::bfloat16_t sum(0.0f);
234+
int reduce_length = 256; // should not exceed 2^8
235+
236+
for (int i = 0; i < reduce_length; ++i) {
237+
sum += x * x;
238+
}
239+
240+
// max diff = 2^(-8) * x * 2 * reduce_length = 0.2
241+
UNITTEST_LOG(float(sum));
242+
UNITTEST_TRUE(float(sum) >= 2.36f);
243+
UNITTEST_TRUE(float(sum) <= 2.76f);
244+
245+
return ark::unittest::SUCCESS;
246+
}
247+
231248
int main() {
232249
ark::init();
233250
UNITTEST(test_bfloat16);
251+
UNITTEST(test_bfloat16_error);
234252
return 0;
235253
}

ark/gpu/gpu_compile.cc

+6-8
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,6 @@
2727

2828
#define ARK_DEBUG_KERNEL 0
2929

30-
using namespace std;
31-
3230
namespace ark {
3331

3432
template <typename ItemType>
@@ -151,7 +149,7 @@ const std::string gpu_compile(const std::vector<std::string> &codes,
151149
items.reserve(codes.size());
152150
srand();
153151
for (auto &code : codes) {
154-
string hash_str = fnv1a_hash(code);
152+
std::string hash_str = fnv1a_hash(code);
155153
items.emplace_back(code, "/tmp/ark_" + hash_str);
156154
}
157155
assert(items.size() == 1);
@@ -187,25 +185,25 @@ const std::string gpu_compile(const std::vector<std::string> &codes,
187185
LOG(INFO, "Compiling: ", code_file_path);
188186
LOG(DEBUG, compile_cmd);
189187
// Run the command.
190-
array<char, 4096> buffer;
191-
stringstream exec_print;
192-
unique_ptr<FILE, decltype(&pclose)> pipe(
188+
std::array<char, 4096> buffer;
189+
std::stringstream exec_print;
190+
std::unique_ptr<FILE, decltype(&pclose)> pipe(
193191
popen(compile_cmd.c_str(), "r"), pclose);
194192
if (!pipe) {
195193
ERR(SystemError, "popen() failed");
196194
}
197195
while (fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr) {
198196
exec_print << buffer.data();
199197
}
200-
string exec_print_str = exec_print.str();
198+
std::string exec_print_str = exec_print.str();
201199
if (exec_print_str.size() > 0) {
202200
ERR(ExecutorError, "\n", compile_cmd, "\n", exec_print_str,
203201
"\n");
204202
}
205203
LOG(INFO, "Compile succeed: ", code_file_path, " (",
206204
cpu_timer() - start, " seconds)");
207205
});
208-
string gpubin_file_path = items[0].second + ".cubin";
206+
std::string gpubin_file_path = items[0].second + ".cubin";
209207
return read_file(gpubin_file_path);
210208
}
211209

ark/gpu/gpu_kernel_test.cc

+1-1
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ ark::unittest::State test_gpu_kernel() {
1818

1919
//
2020
const std::string test_kernel_loop_void =
21-
"__device__ void ark_loop_body(int _iter) {\n"
21+
"__device__ void ark_loop_body(char *_buf, int _iter) {\n"
2222
" // Do nothing. Print iteration counter.\n"
2323
" if (threadIdx.x == 0 && blockIdx.x == 0) {\n"
2424
" if (_iter % 50 == 49) {\n"

ark/gpu/gpu_loop_kernel.cc

+2-1
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ GpuLoopKernel::GpuLoopKernel(std::shared_ptr<GpuContext> ctx,
7373
"extern \"C\" __global__ __launch_bounds__(" << block_dim_[0] << ", 1)\n"
7474
"void " << kernel_name_ << "(int *_it)\n"
7575
"{\n"
76+
" char *_buf = " ARK_BUF_NAME ";\n"
7677
" int *shared_mem = (int *)_ARK_SMEM;\n"
7778
" for (int i = threadIdx.x; i < ARK_SMEM_RESERVED_BYTES / sizeof(int); i += blockDim.x) {\n"
7879
" shared_mem[i] = 0;\n"
@@ -88,7 +89,7 @@ GpuLoopKernel::GpuLoopKernel(std::shared_ptr<GpuContext> ctx,
8889
" return;\n"
8990
" }\n"
9091
" for (int _i = 0; _i < _ITER; ++_i) {\n"
91-
" ark_loop_body(_i);\n"
92+
" ark_loop_body(_buf, _i);\n"
9293
" ark::sync_gpu<" << num_sm << ">(" ARK_LSS_NAME ");\n"
9394
" }\n"
9495
" if (threadIdx.x == 0 && blockIdx.x == 0) {\n"

ark/half_test.cc

+18
Original file line numberDiff line numberDiff line change
@@ -227,8 +227,26 @@ ark::unittest::State test_half() {
227227
return ark::unittest::SUCCESS;
228228
}
229229

230+
ark::unittest::State test_half_error() {
231+
ark::half_t x(0.1f);
232+
ark::half_t sum(0.0f);
233+
int reduce_length = 1024; // should not exceed 2^11
234+
235+
for (int i = 0; i < reduce_length; ++i) {
236+
sum += x * x;
237+
}
238+
239+
// max diff = 2^(-11) * x * 2 * reduce_length = 0.1
240+
UNITTEST_LOG(float(sum));
241+
UNITTEST_TRUE(float(sum) >= 10.14f);
242+
UNITTEST_TRUE(float(sum) <= 10.34f);
243+
244+
return ark::unittest::SUCCESS;
245+
}
246+
230247
int main() {
231248
ark::init();
232249
UNITTEST(test_half);
250+
UNITTEST(test_half_error);
233251
return 0;
234252
}

0 commit comments

Comments
 (0)