@@ -8,12 +8,12 @@ namespace onnxruntime {
88namespace cuda {
99
1010template <typename T, int RANK>
11- __global__ void _UpampleNearestKernel (const TArray<int64_t > input_pitches,
12- const TArray<fast_divmod> output_div_pitches,
13- const TArray<fast_divmod> scales_div,
14- const T* __restrict__ input_data,
15- T* __restrict__ output_data,
16- const size_t N) {
11+ __global__ void _UpsampleNearestKernel (const TArray<int64_t > input_pitches,
12+ const TArray<fast_divmod> output_div_pitches,
13+ const TArray<fast_divmod> scales_div,
14+ const T* __restrict__ input_data,
15+ T* __restrict__ output_data,
16+ const size_t N) {
1717 CALCULATE_ELEMENTWISE_INDEX_OR_EXIT (id, N);
1818 CUDA_LONG input_index = 0 ;
1919 CUDA_LONG output_index = id;
@@ -36,13 +36,13 @@ __global__ void _UpampleNearestKernel(const TArray<int64_t> input_pitches,
3636// This is the common use-case where the 4-D input (batched multi-channel images)
3737// is usually of shape [N, C, H, W] and the scales are [1.0, 1.0, height_scale, width_scale]
3838template <typename T>
39- __global__ void _UpampleBilinear4DInputKernel (const int64_t input_dim2,
40- const TArray<int64_t > input_pitches,
41- const TArray<fast_divmod> output_div_pitches,
42- const TArray<fast_divmod> scales_div,
43- const T* __restrict__ input_data,
44- T* __restrict__ output_data,
45- const size_t N) {
39+ __global__ void _UpsampleBilinear4DInputKernel (const int64_t input_dim2,
40+ const TArray<int64_t > input_pitches,
41+ const TArray<fast_divmod> output_div_pitches,
42+ const TArray<fast_divmod> scales_div,
43+ const T* __restrict__ input_data,
44+ T* __restrict__ output_data,
45+ const size_t N) {
4646 CALCULATE_ELEMENTWISE_INDEX_OR_EXIT (id, N);
4747 CUDA_LONG input_index = 0 ;
4848
@@ -95,13 +95,13 @@ __global__ void _UpampleBilinear4DInputKernel(const int64_t input_dim2,
9595
9696// The following method supports a 2-D input in 'Linear mode'
9797template <typename T>
98- __global__ void _UpampleBilinear2DInputKernel (const int64_t input_dim0,
99- const TArray<int64_t > input_pitches,
100- const TArray<fast_divmod> output_div_pitches,
101- const TArray<fast_divmod> scales_div,
102- const T* __restrict__ input_data,
103- T* __restrict__ output_data,
104- const size_t N) {
98+ __global__ void _UpsampleBilinear2DInputKernel (const int64_t input_dim0,
99+ const TArray<int64_t > input_pitches,
100+ const TArray<fast_divmod> output_div_pitches,
101+ const TArray<fast_divmod> scales_div,
102+ const T* __restrict__ input_data,
103+ T* __restrict__ output_data,
104+ const size_t N) {
105105 CALCULATE_ELEMENTWISE_INDEX_OR_EXIT (id, N);
106106 CUDA_LONG input_index = 0 ;
107107
@@ -147,44 +147,44 @@ __global__ void _UpampleBilinear2DInputKernel(const int64_t input_dim0,
147147}
148148
149149template <typename T>
150- void UpampleImpl (cudaStream_t stream,
151- const onnxruntime::UpsampleMode upsample_mode,
152- const size_t rank,
153- const int64_t input_dim2,
154- const TArray<int64_t >& input_pitches,
155- const TArray<fast_divmod>& output_div_pitches,
156- const TArray<fast_divmod>& scales_div,
157- const T* input_data,
158- T* output_data,
159- const size_t N) {
150+ void UpsampleImpl (cudaStream_t stream,
151+ const onnxruntime::UpsampleMode upsample_mode,
152+ const size_t rank,
153+ const int64_t input_dim2,
154+ const TArray<int64_t >& input_pitches,
155+ const TArray<fast_divmod>& output_div_pitches,
156+ const TArray<fast_divmod>& scales_div,
157+ const T* input_data,
158+ T* output_data,
159+ const size_t N) {
160160 int blocksPerGrid = (int )(ceil (static_cast <float >(N) / GridDim::maxThreadsPerBlock));
161161 if (onnxruntime::UpsampleMode::NN == upsample_mode) {
162162 if (rank == 4 ) {
163- _UpampleNearestKernel <T, 4 ><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0 , stream>>> (
163+ _UpsampleNearestKernel <T, 4 ><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0 , stream>>> (
164164 input_pitches, output_div_pitches, scales_div,
165165 input_data, output_data, N);
166166 } else if (rank == 3 ) {
167- _UpampleNearestKernel <T, 3 ><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0 , stream>>> (
167+ _UpsampleNearestKernel <T, 3 ><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0 , stream>>> (
168168 input_pitches, output_div_pitches, scales_div,
169169 input_data, output_data, N);
170170 } else if (rank == 2 ) {
171- _UpampleNearestKernel <T, 2 ><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0 , stream>>> (
171+ _UpsampleNearestKernel <T, 2 ><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0 , stream>>> (
172172 input_pitches, output_div_pitches, scales_div,
173173 input_data, output_data, N);
174174 } else if (rank == 1 ) {
175- _UpampleNearestKernel <T, 1 ><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0 , stream>>> (
175+ _UpsampleNearestKernel <T, 1 ><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0 , stream>>> (
176176 input_pitches, output_div_pitches, scales_div,
177177 input_data, output_data, N);
178178 } else {
179179 ORT_THROW (" Unsupported rank by the Upsample CUDA kernel. Input rank: " , rank);
180180 }
181181 } else if (onnxruntime::UpsampleMode::LINEAR == upsample_mode) {
182182 if (rank == 4 ) {
183- _UpampleBilinear4DInputKernel <T><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0 , stream>>> (
183+ _UpsampleBilinear4DInputKernel <T><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0 , stream>>> (
184184 input_dim2, input_pitches, output_div_pitches, scales_div,
185185 input_data, output_data, N);
186186 } else if (rank == 2 ) {
187- _UpampleBilinear2DInputKernel <T><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0 , stream>>> (
187+ _UpsampleBilinear2DInputKernel <T><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0 , stream>>> (
188188 input_dim2, input_pitches, output_div_pitches, scales_div,
189189 input_data, output_data, N);
190190 } else {
@@ -197,17 +197,17 @@ void UpampleImpl(cudaStream_t stream,
197197 }
198198}
199199
200- #define SPECIALIZED_IMPL (T ) \
201- template void UpampleImpl <T>(cudaStream_t stream, \
202- const onnxruntime::UpsampleMode upsample_mode, \
203- const size_t rank, \
204- const int64_t input_dim2, \
205- const TArray<int64_t >& input_pitches, \
206- const TArray<fast_divmod>& output_div_pitches, \
207- const TArray<fast_divmod>& scales_div, \
208- const T* input_data, \
209- T* output_data, \
210- const size_t N);
200+ #define SPECIALIZED_IMPL (T ) \
201+ template void UpsampleImpl <T>(cudaStream_t stream, \
202+ const onnxruntime::UpsampleMode upsample_mode, \
203+ const size_t rank, \
204+ const int64_t input_dim2, \
205+ const TArray<int64_t >& input_pitches, \
206+ const TArray<fast_divmod>& output_div_pitches, \
207+ const TArray<fast_divmod>& scales_div, \
208+ const T* input_data, \
209+ T* output_data, \
210+ const size_t N);
211211
212212SPECIALIZED_IMPL (float )
213213SPECIALIZED_IMPL (double )
0 commit comments