dusty-nv · HiroshiYAMA · Sep 6, 2020 · Sep 8, 2020
diff --git a/cuda/cudaMerge.cu b/cuda/cudaMerge.cu
@@ -0,0 +1,120 @@
+#include "cudaMerge.h"
+
+
+
+// gpuMerge.
+// * Merge an image on the GPU (supports RGB/BGR, RGBA/BGRA from some single color planes(using GRAY format))
+template<typename T, int CH>
+__global__ void gpuMerge(T *input0, T *input1, T *input2, T *input3, T *output, size_t width, size_t height)
+{
+	const int x = blockIdx.x * blockDim.x + threadIdx.x;
+	const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+	if( x >= width || y >= height )
+		return;
+
+	const T p0 = input0[y * width + x];
+	const T p1 = input1[y * width + x];
+	const T p2 = input2[y * width + x];
+	const T p3 = (CH == 4) ? input3[y * width + x] : T(0);
+
+	output[(y * width + x) * CH + 0] = p0;
+	output[(y * width + x) * CH + 1] = p1;
+	output[(y * width + x) * CH + 2] = p2;
+	if (CH == 4) output[(y * width + x) * CH + 3] = p3;
+}
+
+// gpuMerge.
+// * Merge an image on the GPU (supports RGB/BGR, RGBA/BGRA from 3 colors and alpha plane)
+template<typename T, typename S, typename R>
+__global__ void gpuMerge(T *input_color, S *input_alpha, R *output, size_t width, size_t height)
+{
+	const int x = blockIdx.x * blockDim.x + threadIdx.x;
+	const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+	if( x >= width || y >= height )
+		return;
+
+	T pix_color = input_color[y * width + x];
+	S pix_alpha = input_alpha[y * width + x];
+	R pix = make_vec<R>(pix_color.x, pix_color.y, pix_color.z, pix_alpha);
+
+	output[y * width + x] = pix;
+}
+
+// launchMerge
+// * Merge an image on the GPU (supports RGB/BGR, RGBA/BGRA from some single color planes(using GRAY format))
+template<typename T, int CH>
+static cudaError_t launchMerge(T **input, T *output, size_t width, size_t height)
+{
+	if( !input[0] || !input[1] || !input[2] || (CH == 4 ? !input[3] : false) || !output )
+		return cudaErrorInvalidDevicePointer;
+
+	if( width == 0 || height == 0 )
+		return cudaErrorInvalidValue;
+
+	// launch kernel
+	const dim3 blockDim(32, 8);
+	const dim3 gridDim(iDivUp(width,blockDim.x), iDivUp(height,blockDim.y));
+
+	gpuMerge<T, CH><<<gridDim, blockDim>>>(input[0], input[1], input[2], (CH == 4) ? input[3] : nullptr, output, width, height);
+
+	return CUDA(cudaGetLastError());
+}
+
+// launchMerge
+// * Merge an image on the GPU (supports RGB/BGR, RGBA/BGRA from 3 colors and alpha plane)
+template<typename T, typename S, typename R>
+static cudaError_t launchMerge(T *input_color, S *input_alpha, R *output, size_t width, size_t height)
+{
+	if( !input_color || !input_alpha || !output )
+		return cudaErrorInvalidDevicePointer;
+
+	if( width == 0 || height == 0 )
+		return cudaErrorInvalidValue;
+
+	// launch kernel
+	const dim3 blockDim(32, 8);
+	const dim3 gridDim(iDivUp(width,blockDim.x), iDivUp(height,blockDim.y));
+
+	gpuMerge<T, S, R><<<gridDim, blockDim>>>(input_color, input_alpha, output, width, height);
+
+	return CUDA(cudaGetLastError());
+}
+
+//-----------------------------------------------------------------------------------
+cudaError_t cudaMerge(void **input, void *output, size_t width, size_t height, imageFormat format)
+{
+	if( format == IMAGE_RGB8 || format == IMAGE_BGR8 )
+		return launchMerge<uchar, 3>((uchar **)input, (uchar *)output, width, height);
+	else if( format == IMAGE_RGBA8 || format == IMAGE_BGRA8 )
+		return launchMerge<uchar, 4>((uchar **)input, (uchar *)output, width, height);
+	else if( format == IMAGE_RGB32F || format == IMAGE_BGR32F )
+		return launchMerge<float, 3>((float **)input, (float *)output, width, height);
+	else if( format == IMAGE_RGBA32F || format == IMAGE_BGRA32F )
+		return launchMerge<float, 4>((float **)input, (float *)output, width, height);
+
+	LogError(LOG_CUDA "cudaMerge() -- invalid image format '%s'\n", imageFormatToStr(format));
+	LogError(LOG_CUDA "                supported formats are:\n");
+	LogError(LOG_CUDA "                    * rgb8, bgr8\n");
+	LogError(LOG_CUDA "                    * rgba8, bgra8\n");
+	LogError(LOG_CUDA "                    * rgb32f, bgr32f\n");
+	LogError(LOG_CUDA "                    * rgba32f, bgra32f\n");
+
+	return cudaErrorInvalidValue;
+}
+
+cudaError_t cudaMerge(void *input_color, void *input_alpha, void *output, size_t width, size_t height, imageFormat format)
+{
+	if( format == IMAGE_RGBA8 || format == IMAGE_BGRA8 )
+		return launchMerge<uchar3, uchar, uchar4>((uchar3 *)input_color, (uchar *)input_alpha, (uchar4 *)output, width, height);
+	else if( format == IMAGE_RGBA32F || format == IMAGE_BGRA32F )
+		return launchMerge<float3, float, float4>((float3 *)input_color, (float *)input_alpha, (float4 *)output, width, height);
+
+	LogError(LOG_CUDA "cudaMerge() -- invalid image format '%s'\n", imageFormatToStr(format));
+	LogError(LOG_CUDA "                supported formats are:\n");
+	LogError(LOG_CUDA "                    * rgba8, bgra8\n");
+	LogError(LOG_CUDA "                    * rgba32f, bgra32f\n");
+
+	return cudaErrorInvalidValue;
+}
diff --git a/cuda/cudaMerge.h b/cuda/cudaMerge.h
@@ -0,0 +1,23 @@
+#ifndef __CUDA_MERGE_H__
+#define __CUDA_MERGE_H__
+
+
+#include "cudaUtility.h"
+#include "cudaVector.h"
+#include "imageFormat.h"
+#include <vector>
+
+
+/**
+ * Merge an image on the GPU (supports RGB/BGR, RGBA/BGRA from some single color planes(using GRAY format))
+ * @ingroup merge
+ */
+cudaError_t cudaMerge(void **input, void *output, size_t width, size_t height, imageFormat format);
+
+/**
+ * Merge an image on the GPU (supports RGB/BGR, RGBA/BGRA from 3 colors and alpha plane)
+ * @ingroup merge
+ */
+cudaError_t cudaMerge(void *input_color, void *input_alpha, void *output, size_t width, size_t height, imageFormat format);
+
+#endif
diff --git a/cuda/cudaSplit.cu b/cuda/cudaSplit.cu
@@ -0,0 +1,126 @@
+#include "cudaSplit.h"
+
+
+
+// gpuSplit.
+// * Split an image on the GPU (supports RGB/BGR, RGBA/BGRA to some single color planes(using GRAY format))
+template<typename T, int CH>
+__global__ void gpuSplit(T *input, T *output0, T *output1, T *output2, T *output3, size_t width, size_t height)
+{
+	const int x = blockIdx.x * blockDim.x + threadIdx.x;
+	const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+	if( x >= width || y >= height )
+		return;
+
+	const T p0 = input[(y * width + x) * CH + 0];
+	const T p1 = input[(y * width + x) * CH + 1];
+	const T p2 = input[(y * width + x) * CH + 2];
+	const T p3 = (CH == 4) ? input[(y * width + x) * CH + 3] : T(0);
+
+	output0[y * width + x] = p0;
+	output1[y * width + x] = p1;
+	output2[y * width + x] = p2;
+	if (CH == 4) output3[y * width + x] = p3;
+}
+
+// gpuSplit.
+// * Split an image on the GPU (supports RGBA/BGRA to 3 colors and alpha plane)
+template<typename T, typename S, typename R>
+__global__ void gpuSplit(T *input, S *output_color, R *output_alpha, size_t width, size_t height)
+{
+	const int x = blockIdx.x * blockDim.x + threadIdx.x;
+	const int y = blockIdx.y * blockDim.y + threadIdx.y;
+
+	if( x >= width || y >= height )
+		return;
+
+	T pix4 = input[y * width + x];	// 3 colors + alpha.
+
+	S pix3 = cast_vec<S>(pix4);	// 3 colors.
+	R pix_a = alpha<T>(pix4);	// alpha.
+
+	output_color[y * width + x] = pix3;
+	output_alpha[y * width + x] = pix_a;
+}
+
+// launchSplit
+// * Split an image on the GPU (supports RGB/BGR, RGBA/BGRA to some single color planes(using GRAY format))
+template<typename T, int CH>
+static cudaError_t launchSplit(T *input, T **output, size_t width, size_t height)
+{
+	if( !input || !output[0] || !output[1] || !output[2] || (CH == 4 ? !output[3] : false) )
+		return cudaErrorInvalidDevicePointer;
+
+	if( width == 0 || height == 0 )
+		return cudaErrorInvalidValue;
+
+	// launch kernel
+	const dim3 blockDim(32, 8);
+	const dim3 gridDim(iDivUp(width,blockDim.x), iDivUp(height,blockDim.y));
+
+	gpuSplit<T, CH><<<gridDim, blockDim>>>(input, output[0], output[1], output[2], (CH == 4) ? output[3] : nullptr, width, height);
+	// std::vector<T *> out_ary;
+	// for (int i = 0; i < CH; i++) out_ary.push_back(output[i]);
+	// printf("=====================\n");
+	// gpuSplit<T, CH><<<gridDim, blockDim>>>(input, out_ary, width, height);
+
+	return CUDA(cudaGetLastError());
+}
+
+// launchSplit
+// * Split an image on the GPU (supports RGBA/BGRA to 3 colors and alpha plane)
+template<typename T, typename S, typename R>
+static cudaError_t launchSplit(T *input, S *output_color, R *output_alpha, size_t width, size_t height)
+{
+	if( !input || !output_color || !output_alpha )
+		return cudaErrorInvalidDevicePointer;
+
+	if( width == 0 || height == 0 )
+		return cudaErrorInvalidValue;
+
+	// launch kernel
+	const dim3 blockDim(32, 8);
+	const dim3 gridDim(iDivUp(width,blockDim.x), iDivUp(height,blockDim.y));
+
+	gpuSplit<T, S, R><<<gridDim, blockDim>>>(input, output_color, output_alpha, width, height);
+
+	return CUDA(cudaGetLastError());
+}
+
+//-----------------------------------------------------------------------------------
+cudaError_t cudaSplit(void *input, void **output, size_t width, size_t height, imageFormat format)
+{
+	if( format == IMAGE_RGB8 || format == IMAGE_BGR8 )
+		return launchSplit<uchar, 3>((uchar *)input, (uchar **)output, width, height);
+	else if( format == IMAGE_RGBA8 || format == IMAGE_BGRA8 )
+		return launchSplit<uchar, 4>((uchar *)input, (uchar **)output, width, height);
+	else if( format == IMAGE_RGB32F || format == IMAGE_BGR32F )
+		return launchSplit<float, 3>((float *)input, (float **)output, width, height);
+	else if( format == IMAGE_RGBA32F || format == IMAGE_BGRA32F )
+		return launchSplit<float, 4>((float *)input, (float **)output, width, height);
+
+	LogError(LOG_CUDA "cudaSplit() -- invalid image format '%s'\n", imageFormatToStr(format));
+	LogError(LOG_CUDA "                supported formats are:\n");
+	LogError(LOG_CUDA "                    * rgb8, bgr8\n");
+	LogError(LOG_CUDA "                    * rgba8, bgra8\n");
+	LogError(LOG_CUDA "                    * rgb32f, bgr32f\n");
+	LogError(LOG_CUDA "                    * rgba32f, bgra32f\n");
+
+	return cudaErrorInvalidValue;
+}
+
+cudaError_t cudaSplit(void *input, void *output_color, void *output_alpha, size_t width, size_t height, imageFormat format)
+{
+	if( format == IMAGE_RGBA8 || format == IMAGE_BGRA8 )
+		return launchSplit<uchar4, uchar3, uchar>((uchar4 *)input, (uchar3 *)output_color, (uchar *)output_alpha, width, height);
+	else if( format == IMAGE_RGBA32F || format == IMAGE_BGRA32F )
+		return launchSplit<float4, float3, float>((float4 *)input, (float3 *)output_color, (float *)output_alpha, width, height);
+
+	LogError(LOG_CUDA "cudaSplit() -- invalid image format '%s'\n", imageFormatToStr(format));
+	LogError(LOG_CUDA "                supported formats are:\n");
+	LogError(LOG_CUDA "                    * rgba8, bgra8\n");
+	LogError(LOG_CUDA "                    * rgba32f, bgra32f\n");
+
+	return cudaErrorInvalidValue;
+}
diff --git a/cuda/cudaSplit.h b/cuda/cudaSplit.h
@@ -0,0 +1,23 @@
+#ifndef __CUDA_SPLIT_H__
+#define __CUDA_SPLIT_H__
+
+
+#include "cudaUtility.h"
+#include "cudaVector.h"
+#include "imageFormat.h"
+#include <vector>
+
+
+/**
+ * Split an image on the GPU (supports RGB/BGR, RGBA/BGRA to some single color planes(using GRAY format))
+ * @ingroup split
+ */
+cudaError_t cudaSplit(void *input, void **output, size_t width, size_t height, imageFormat format);
+
+/**
+ * Split an image on the GPU (supports RGBA/BGRA to 3 colors and alpha plane)
+ * @ingroup split
+ */
+cudaError_t cudaSplit(void *input, void *output_color, void *output_alpha, size_t width, size_t height, imageFormat format);
+
+#endif