
Commit b599401

Cleanup
1 parent 11e2e92 commit b599401

5 files changed (+0, -104 lines)

csrc/kernels.cu (-66 lines)

@@ -2352,69 +2352,6 @@ __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *o
  }
}

-template <int FORMAT> __global__ void kExtractOutliers(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA)
-{
-  int local_colidx = idx[blockIdx.x];
-
-  if(FORMAT==COL_TURING)
-  {
-    // TURING FORMAT:
-    // 8*32 tiles with 4*4 subtiles
-    // the 8*32 subtile has first all 4*4 subtiles of even rows (max 4*4*8 = 128 elements)
-    // the subsequent 4*4 subtiles are for all odd rows if some rows columns are empty the values are zero
-    // the tile repeats again after the 8*32 tile in a major column order, meaning: (next 8 rows are A[8:16, 0:32])
-    // the next tile is the next 8 rows for the same 32 columns. Once all rows are finished, the column
-    // index increases by 32
-    // columns are grouped in increments of 4, meaning that one has the following rows and columns
-    // rows: [0 0 0 0, 2 2 2 2, 4 4 4 4, 6 6 6 6, 0 0 0 0 ...]
-    // cols: [0 1 2 3, 0 1 2 4, 0 1 2 3, 0 1 2 3, 4 5 6 7 ...]
-
-    // each thread reads 1 element = 1 row
-    for(int row = threadIdx.x; row < rowsA; row+= blockDim.x)
-    {
-      int offset_per_col_tile = ((rowsA+7)/8)*32*8;
-      int tile_offset_rows = (row/8)*32*8;
-      int tile_offset_cols = (local_colidx/32)*offset_per_col_tile;
-      int offset = 0;
-      int subtile_col_idx = local_colidx%32;
-      int subtile_row_idx = row % 8;
-      if(row % 2 == 1)
-        offset += 128 + (subtile_col_idx/4)*16 + (subtile_col_idx%4) + ((subtile_row_idx-1)*2);
-      else
-        // even
-        offset += 0 + (subtile_col_idx/4)*16 + (subtile_col_idx%4) + (subtile_row_idx*2);
-
-      offset += tile_offset_rows + tile_offset_cols;
-
-      char val = A[offset];
-
-      int out_idx = (row*idx_size) + blockIdx.x;
-      out[out_idx] = val;
-    }
-  }
-  else if(FORMAT == COL_AMPERE)
-  {
-
-    for(int row = threadIdx.x; row < rowsA; row+= blockDim.x)
-    {
-      // we got 32x32 tiles and we use the magic equation from the cublasLt doc to get the element
-      // within each tile.
-      int offset_per_col_tile = ((rowsA+31)/32)*32*32;
-      int tile_offset_rows = (row/32)*32*32;
-      int tile_offset_cols = (local_colidx/32)*offset_per_col_tile;
-      int subtile_col_idx = local_colidx%32;
-      int subtile_row_idx = row % 32;
-      // this magic is taken from the cublasLt doc (search for COL32)
-      int offset = (((subtile_row_idx%8)/2*4+subtile_row_idx/8)*2+subtile_row_idx%2)*32+subtile_col_idx;
-      offset += tile_offset_cols + tile_offset_rows;
-
-      char val = A[offset];
-      int out_idx = (row*idx_size) + blockIdx.x;
-      out[out_idx] = val;
-    }
-  }
-}
-
#define WARPS 3
template <typename T, int BITS, int THREADS> __global__ void gemm_device(int M, int N, int K, T * __restrict__ const A, T* B, T * out, int lda, int ldb, int ldc)
{

@@ -3049,9 +2986,6 @@ template __global__ void kgemm_4bit_inference_naive<half, 128, 16>(int M, int N,
template __global__ void kgemm_4bit_inference_naive<__nv_bfloat16, 128, 16>(int M, int N, int K, __nv_bfloat16 * __restrict__ const A, unsigned char *B, float *absmax, const float *datatype, __nv_bfloat16 * out, int lda, int ldb, int ldc, int blocksize);
template __global__ void kgemm_4bit_inference_naive<float, 128, 32>(int M, int N, int K, float * __restrict__ const A, unsigned char *B, float *absmax, const float *datatype, float * out, int lda, int ldb, int ldc, int blocksize);

-template __global__ void kExtractOutliers<COL_TURING>(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA);
-template __global__ void kExtractOutliers<COL_AMPERE>(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA);
-
template __global__ void kspmm_coo_very_sparse_naive<half, 8, 16>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
template __global__ void kspmm_coo_very_sparse_naive<half, 16, 16>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
template __global__ void kspmm_coo_very_sparse_naive<half, 32, 16>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
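
Note: the comments in the removed kExtractOutliers kernel are the only description of the COL_TURING and COL_AMPERE (COL32) index math in this diff. The following is a minimal host-side C++ sketch of that math, kept as a reference. The function names and the demo in main() are illustrative and not part of the repository; the arithmetic simply mirrors the two branches of the removed kernel.

// Host-side sketch of the tiled-index math from the removed kExtractOutliers kernel.
// col_turing_offset/col_ampere_offset are hypothetical helper names; the formulas
// are copied from the COL_TURING and COL_AMPERE branches shown in the diff above.
#include <cstdio>

// Turing layout: 8x32 tiles of 4x4 subtiles, even rows packed into the first
// 128 elements of a tile, odd rows into the second 128; tiles laid out in
// column-major order over 32-column stripes.
int col_turing_offset(int row, int col, int rows)
{
    int offset_per_col_tile = ((rows + 7) / 8) * 32 * 8;        // elements per 32-column stripe (rows padded to 8)
    int tile_offset_rows    = (row / 8) * 32 * 8;               // start of the 8x32 tile holding this row
    int tile_offset_cols    = (col / 32) * offset_per_col_tile; // start of the stripe holding this column
    int subtile_col_idx     = col % 32;
    int subtile_row_idx     = row % 8;

    int offset;
    if (row % 2 == 1)   // odd rows: second half of the tile
        offset = 128 + (subtile_col_idx / 4) * 16 + (subtile_col_idx % 4) + ((subtile_row_idx - 1) * 2);
    else                // even rows: first half of the tile
        offset =       (subtile_col_idx / 4) * 16 + (subtile_col_idx % 4) + (subtile_row_idx * 2);

    return offset + tile_offset_rows + tile_offset_cols;
}

// Ampere layout: 32x32 tiles; the in-tile permutation is the COL32 equation
// referenced in the removed kernel's comment (see the cuBLASLt docs for COL32).
int col_ampere_offset(int row, int col, int rows)
{
    int offset_per_col_tile = ((rows + 31) / 32) * 32 * 32;     // elements per 32-column stripe (rows padded to 32)
    int tile_offset_rows    = (row / 32) * 32 * 32;
    int tile_offset_cols    = (col / 32) * offset_per_col_tile;
    int subtile_col_idx     = col % 32;
    int subtile_row_idx     = row % 32;

    int offset = (((subtile_row_idx % 8) / 2 * 4 + subtile_row_idx / 8) * 2 + subtile_row_idx % 2) * 32
                 + subtile_col_idx;
    return offset + tile_offset_cols + tile_offset_rows;
}

int main()
{
    // Where do the first few elements of column 5 land in each layout,
    // for a matrix with 16 rows? (Shapes are made up for illustration.)
    for (int row = 0; row < 4; ++row)
        printf("row %d, col 5 -> turing %d, ampere %d\n",
               row, col_turing_offset(row, 5, 16), col_ampere_offset(row, 5, 16));
    return 0;
}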

csrc/kernels.cuh (-2 lines)

@@ -121,8 +121,6 @@ template<typename T, int THREADS, int SPARSE_DECOMP> __global__ void kInt8Vector

template <int THREADS, int ITEMS_PER_THREAD, int TILE_ROWS, int TILE_COLS, int TRANSPOSE, int FORMAT> __global__ void kTransformRowToFormat(char *__restrict__ const A, char *out, int rows, int cols, int tiledCols, int outRows, int outCols);

-template <int FORMAT> __global__ void kExtractOutliers(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA);
-
template <typename T, int BITS, int THREADS> __global__ void gemm_device(int M, int N, int K, T * __restrict__ const A, T* B, T * out, int lda, int ldb, int ldc);
template <typename T, int THREADS> __global__ void kgemm_4bit_inference(int M, int N, int K, T * __restrict__ const A, unsigned char *B, float *absmax, T * out, int lda, int ldb, int ldc, int blocksize);
template <typename T, int THREADS, int BITS> __global__ void kgemm_4bit_inference_naive(int M, int N, int K, T * __restrict__ const A, unsigned char *B, float *absmax, const float *datatype, T * out, int lda, int ldb, int ldc, int blocksize);

csrc/ops.cu (-28 lines)

@@ -557,32 +557,6 @@ template <typename T, int BITS> void spmm_coo_very_sparse_naive(int *max_count,
  CUDA_CHECK_RETURN(cudaPeekAtLastError());
}

-
-template <int FORMAT> void extractOutliers(char * A, int *idx, char *out, int idx_size, int rows, int cols)
-{
-  int threads = 256;
-  // we load 128 column values per warp
-  int tiledCols = tiledCols = fill_up_to_nearest_multiple(cols, 32);
-  int tiledRows = 0;
-
-  int num_blocks = idx_size;
-
-  if(FORMAT == COL_TURING)
-  {
-    tiledRows = fill_up_to_nearest_multiple(rows, 8);
-  }
-  else if(FORMAT == COL_AMPERE)
-  {
-    tiledRows = fill_up_to_nearest_multiple(rows, 32);
-  }
-
-  kExtractOutliers<FORMAT><<<num_blocks, threads>>>(A, idx, out, idx_size, rows, cols, tiledRows, tiledCols);
-  CUDA_CHECK_RETURN(cudaPeekAtLastError());
-}
-
-
-
-
template <typename T> void gemm_host(int m, int n, int k, T * A, T* B, T * out, int lda, int ldb, int ldc, int bits)
{

@@ -636,8 +610,6 @@ template void gemm_4bit_inference_naive<float, 32>(int m, int n, int k, float *

//template void gemm_host<float>(int m, int n, int k, float * A, float* B, float * out, int lda, int ldb, int ldc, int bits);
template void gemm_host<half>(int m, int n, int k, half * A, half* B, half * out, int lda, int ldb, int ldc, int bits);
-template void extractOutliers<COL_TURING>(char * A, int *idx, char *out, int idx_size, int rows, int cols);
-template void extractOutliers<COL_AMPERE>(char * A, int *idx, char *out, int idx_size, int rows, int cols);

template void spmm_coo_very_sparse_naive<half, 16>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz_rows, int nnz, int rowsA, int rowsB, int colsB);
template void spmm_coo_very_sparse_naive<signed char, 8>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float *dequant_stats, int nnz_rows, int nnz, int rowsA, int rowsB, int colsB);
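
For reference, the removed extractOutliers() launcher used one block per requested outlier column (num_blocks = idx_size), 256 threads per block, columns padded to a multiple of 32, and rows padded to a multiple of 8 for COL_TURING or 32 for COL_AMPERE. The sketch below reproduces only that host-side bookkeeping; fill_up_to_nearest_multiple is a local stand-in for the repository helper of the same name, and the shapes in main() are made up for illustration.

// Host-only sketch of the launch configuration computed by the removed launcher.
// No kernel is launched here; the numbers mirror the deleted code above.
#include <cstdio>

static int fill_up_to_nearest_multiple(int value, int multiple)
{
    // Round value up to the next multiple (e.g. 100 rows -> 104 for a multiple of 8).
    return ((value + multiple - 1) / multiple) * multiple;
}

int main()
{
    int rows = 100, cols = 300, idx_size = 7;   // illustrative shapes

    int threads    = 256;                        // threads per block, as in the removed launcher
    int num_blocks = idx_size;                   // one block per requested outlier column
    int tiledCols  = fill_up_to_nearest_multiple(cols, 32);

    int tiledRows_turing = fill_up_to_nearest_multiple(rows, 8);   // COL_TURING: 8-row tiles
    int tiledRows_ampere = fill_up_to_nearest_multiple(rows, 32);  // COL_AMPERE: 32-row tiles

    printf("grid = %d blocks x %d threads, tiledCols = %d, tiledRows(turing) = %d, tiledRows(ampere) = %d\n",
           num_blocks, threads, tiledCols, tiledRows_turing, tiledRows_ampere);
    return 0;
}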

csrc/ops.cuh (-2 lines)

@@ -182,8 +182,6 @@ void spmm_coo(cusparseHandle_t handle, int *A_rowidx, int *A_colidx, half *A_val

template <typename T, int BITS> void spmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, T *B, half *out, float *dequant_stats, int nnz_rows, int nnz, int rowsA, int rowsB, int colsB);

-template <int FORMAT> void extractOutliers(char * A, int *idx, char *out, int idx_size, int rows, int cols);
-
void matmul4bite(half *A, unsigned char *B, half*out, int lda, int ldb, int rowsA, int colsA, int colsB);

template <typename T> void gemm_host(int m, int n, int k, T * A, T* B, T * out, int lda, int ldb, int ldc, int bits);

csrc/pythonInterface.cpp (-6 lines)

@@ -149,9 +149,6 @@ void dequantizeBlockwise_bf16(float *code, unsigned char *A, float *absmax, __nv
void dequantizeBlockwise_bf16_fp4(float *code, unsigned char *A, float *absmax, __nv_bfloat16 *out, int blocksize, const int n, cudaStream_t stream){ dequantizeBlockwise<__nv_bfloat16, FP4>(NULL, A, absmax, out, blocksize, n, stream); }
void dequantizeBlockwise_bf16_nf4(float *code, unsigned char *A, float *absmax, __nv_bfloat16 *out, int blocksize, const int n, cudaStream_t stream){ dequantizeBlockwise<__nv_bfloat16, NF4>(NULL, A, absmax, out, blocksize, n, stream); }

-void extractOutliers_turing(char * A, int *idx, char *out, int idx_size, int rows, int cols){ extractOutliers<COL_TURING>(A, idx, out, idx_size, rows, cols); }
-void extractOutliers_ampere(char * A, int *idx, char *out, int idx_size, int rows, int cols){ extractOutliers<COL_AMPERE>(A, idx, out, idx_size, rows, cols); }
-
int igemmlt_32(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc, cudaStream_t stream) {
  return igemmlt<32, 0>(ltHandle, m, n, k, A, B, C, row_scale, lda, ldb, ldc, stream);
}

@@ -312,9 +309,6 @@ extern "C"
void cspmm_coo_very_sparse_naive_int8(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float *dequant_stats, int nnz_rows, int nnz, int rowsA, int rowsB, int colsB)
{ spmm_coo_very_sparse_naive_int8(max_count, max_idx, offset_rowidx, rowidx, colidx, values, B, out, dequant_stats, nnz_rows, nnz, rowsA, rowsB, colsB); }

-void cextractOutliers_turing(char * A, int *idx, char *out, int idx_size, int rows, int cols){ extractOutliers_turing(A, idx, out, idx_size, rows, cols); }
-void cextractOutliers_ampere(char * A, int *idx, char *out, int idx_size, int rows, int cols){ extractOutliers_ampere(A, idx, out, idx_size, rows, cols); }
-
//void cgemm_host_fp32(int M, int N, int K, float * A, float* B, float * out, int lda, int ldb, int ldc)
//{ gemm_host_fp32(M, N, K, A, B, out, lda, ldb, ldc); }
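
Taken together, the five files remove one feature along the layering visible in this diff: a CUDA kernel (kernels.cu/.cuh), a templated launcher (ops.cu/.cuh), and per-format wrappers plus c-prefixed extern "C" entry points (pythonInterface.cpp). The stub below sketches that wrapper chain using the removed signatures; the bodies only print, COL_TURING/COL_AMPERE are stand-in macros rather than the repository's real format constants, and nothing here launches an actual kernel.

// Compile-and-run stub of the wrapper chain removed by this commit.
#include <cstdio>

#define COL_TURING 0   // stand-in values; the repository defines real format constants
#define COL_AMPERE 1

// Templated launcher (the role the removed extractOutliers<FORMAT> in ops.cu played).
template <int FORMAT> void extractOutliers(char *A, int *idx, char *out, int idx_size, int rows, int cols)
{
    (void)A; (void)idx; (void)out; (void)rows; (void)cols;  // unused in this stub
    printf("would launch kExtractOutliers<%d> with %d blocks\n", FORMAT, idx_size);
}

// Per-format wrappers (file-scope functions in pythonInterface.cpp).
void extractOutliers_turing(char *A, int *idx, char *out, int idx_size, int rows, int cols)
{ extractOutliers<COL_TURING>(A, idx, out, idx_size, rows, cols); }
void extractOutliers_ampere(char *A, int *idx, char *out, int idx_size, int rows, int cols)
{ extractOutliers<COL_AMPERE>(A, idx, out, idx_size, rows, cols); }

// C-linkage entry points (inside the extern "C" block), callable from a ctypes binding.
extern "C" {
    void cextractOutliers_turing(char *A, int *idx, char *out, int idx_size, int rows, int cols)
    { extractOutliers_turing(A, idx, out, idx_size, rows, cols); }
    void cextractOutliers_ampere(char *A, int *idx, char *out, int idx_size, int rows, int cols)
    { extractOutliers_ampere(A, idx, out, idx_size, rows, cols); }
}

int main()
{
    cextractOutliers_turing(nullptr, nullptr, nullptr, 4, 16, 64);  // illustrative call only
    return 0;
}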