diff --git a/mmros/include/mmros/process/image.hpp b/mmros/include/mmros/process/image.hpp
index 5399f3f..047f00b 100644
--- a/mmros/include/mmros/process/image.hpp
+++ b/mmros/include/mmros/process/image.hpp
@@ -1,4 +1,4 @@
-// Copyright 2023 TIER IV, Inc.
+// Copyright 2025 Kotaro Uetake.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -15,189 +15,28 @@
 #ifndef MMROS__PROCESS__IMAGE_HPP_
 #define MMROS__PROCESS__IMAGE_HPP_
 
-#include <cublas_v2.h>
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <cuda_runtime_api.h>
-#include <curand.h>
+#include "mmros/process/image_kernel.hpp"
 
-namespace mmros::process
-{
-struct Roi
-{
-  int x;
-  int y;
-  int w;
-  int h;
-};
-
-/**
- * @brief Resize a image using bilinear interpolation on gpus
- * @param[out] dst Resized image
- * @param[in] src image
- * @param[in] d_w width for resized image
- * @param[in] d_h height for resized image
- * @param[in] d_c channel for resized image
- * @param[in] s_w width for input image
- * @param[in] s_h height for input image
- * @param[in] s_c channel for input image
- * @param[in] stream cuda stream
- */
-extern void resize_bilinear_gpu(
-  unsigned char * dst, unsigned char * src, int d_w, int d_h, int d_c, int s_w, int s_h, int s_c,
-  cudaStream_t stream);
-
-/**
- * @brief Letterbox a image on gpus
- * @param[out] dst letterbox-ed image
- * @param[in] src image
- * @param[in] d_w width for letterbox-ing
- * @param[in] d_h height for letterbox-ing
- * @param[in] d_c channel for letterbox-ing
- * @param[in] s_w width for input image
- * @param[in] s_h height for input image
- * @param[in] s_c channel for input image
- * @param[in] stream cuda stream
- */
-extern void letterbox_gpu(
-  unsigned char * dst, unsigned char * src, int d_w, int d_h, int d_c, int s_w, int s_h, int s_c,
-  cudaStream_t stream);
-
-/**
- * @brief NHWC to NHWC conversion
- * @param[out] dst converted image
- * @param[in] src image
- * @param[in] d_w width for a image
- * @param[in] d_h height for a image
- * @param[in] d_c channel for a image
- * @param[in] stream cuda stream
- */
-extern void nchw_to_nhwc_gpu(
-  unsigned char * dst, unsigned char * src, int d_w, int d_h, int d_c, cudaStream_t stream);
-
-/**
- * @brief Unsigned char to float32 for inference
- * @param[out] dst32 converted image
- * @param[in] src image
- * @param[in] d_w width for a image
- * @param[in] d_h height for a image
- * @param[in] d_c channel for a image
- * @param[in] stream cuda stream
- */
-extern void to_float_gpu(
-  float * dst32, unsigned char * src, int d_w, int d_h, int d_c, cudaStream_t stream);
-
-/**
- * @brief Resize and letterbox a image using bilinear interpolation on gpus
- * @param[out] dst processed image
- * @param[in] src image
- * @param[in] d_w width for output
- * @param[in] d_h height for output
- * @param[in] d_c channel for output
- * @param[in] s_w width for input
- * @param[in] s_h height for input
- * @param[in] s_c channel for input
- * @param[in] stream cuda stream
- */
-extern void resize_bilinear_letterbox_gpu(
-  unsigned char * dst, unsigned char * src, int d_w, int d_h, int d_c, int s_w, int s_h, int s_c,
-  cudaStream_t stream);
-
-/**
- * @brief Optimized preprocessing including resize, letterbox, nhwc2nchw, toFloat and normalization
- * for YOLOX on gpus
- * @param[out] dst processed image
- * @param[in] src image
- * @param[in] d_w width for output
- * @param[in] d_h height for output
- * @param[in] d_c channel for output
- * @param[in] s_w width for input
- * @param[in] s_h height for input
- * @param[in] s_c channel for input
- * @param[in] mean mean values for each channel
- * @param[in] std std values for each channel
- * @param[in] stream cuda stream
- */
-extern void resize_bilinear_letterbox_nhwc_to_nchw32_gpu(
-  float * dst, unsigned char * src, int d_w, int d_h, int d_c, int s_w, int s_h, int s_c,
-  float * mean, float * std, cudaStream_t stream);
-
-/**
- * @brief Optimized preprocessing including resize, letterbox, nhwc2nchw, toFloat and normalization
- * with batching for YOLOX on gpus
- * @param[out] dst processed image
- * @param[in] src image
- * @param[in] d_w width for output
- * @param[in] d_h height for output
- * @param[in] d_c channel for output
- * @param[in] s_w width for input
- * @param[in] s_h height for input
- * @param[in] s_c channel for input
- * @param[in] batch batch size
- * @param[in] mean mean values for each channel
- * @param[in] std std values for each channel
- * @param[in] stream cuda stream
- */
-extern void resize_bilinear_letterbox_nhwc_to_nchw32_batch_gpu(
-  float * dst, unsigned char * src, int d_w, int d_h, int d_c, int s_w, int s_h, int s_c, int batch,
-  float * mean, float * std, cudaStream_t stream);
+#include <opencv2/core/mat.hpp>
 
-/**
- * @brief Optimized preprocessing including crop, resize, letterbox, nhwc2nchw, toFloat and
- * normalization with batching for YOLOX on gpus
- * @param[out] dst processed image
- * @param[in] src image
- * @param[in] d_w width for output
- * @param[in] d_h height for output
- * @param[in] d_c channel for output
- * @param[in] s_w width for input
- * @param[in] s_h height for input
- * @param[in] s_c channel for input
- * @param[in] d_roi regions of interest for cropping
- * @param[in] batch batch size
- * @param[in] mean mean values for each channel
- * @param[in] std std values for each channel
- * @param[in] stream cuda stream
- */
-extern void crop_resize_bilinear_letterbox_nhwc_to_nchw32_batch_gpu(
-  float * dst, unsigned char * src, int d_w, int d_h, int d_c, Roi * d_roi, int s_w, int s_h,
-  int s_c, int batch, float * mean, float * std, cudaStream_t stream);
-
-/**
- * @brief Optimized multi-scale preprocessing including crop, resize, letterbox, nhwc2nchw, toFloat
- * and normalization with batching for YOLOX on gpus
- * @param[out] dst processed image
- * @param[in] src image
- * @param[in] d_w width for output
- * @param[in] d_h height for output
- * @param[in] d_c channel for output
- * @param[in] s_w width for input
- * @param[in] s_h height for input
- * @param[in] s_c channel for input
- * @param[in] d_roi regions of interest for cropping
- * @param[in] batch batch size
- * @param[in] mean mean values for each channel
- * @param[in] std std values for each channel
- * @param[in] stream cuda stream
- */
-extern void multi_scale_resize_bilinear_letterbox_nhwc_to_nchw32_batch_gpu(
-  float * dst, unsigned char * src, int d_w, int d_h, int d_c, Roi * d_roi, int s_w, int s_h,
-  int s_c, int batch, float * mean, float * std, cudaStream_t stream);
+#include <vector>
 
+namespace mmros::process
+{
 /**
- * @brief Argmax on GPU
- * @param[out] dst processed image
- * @param[in] src probability map
- * @param[in] d_w width for output
- * @param[in] d_h height for output
- * @param[in] s_w width for input
- * @param[in] s_h height for input
- * @param[in] s_c channel for input
- * @param[in] batch batch size
- * @param[in] stream cuda stream
- */
-extern void argmax_gpu(
-  unsigned char * dst, float * src, int d_w, int d_h, int s_w, int s_h, int s_c, int batch,
-  cudaStream_t stream);
+ * Run preprocessing for image.
+ *
+ * @param input_d Pointer to input image on device.
+ * @param scales Mutable reference to the vector of scale factors.
+ * @param images Read-only reference to the vector of source images.
+ * @param in_width Model input width.
+ * @param in_height Model input height.
+ * @param mean Pointer to the image mean values.
+ * @param std Pointer to the image std values.
+ * @param stream CUDA stream.
+ */
+void preprocess_image(
+  float * input_d, std::vector<float> & scales, const std::vector<cv::Mat> & images,
+  int64_t in_width, int64_t in_height, float * mean, float * std, cudaStream_t stream);
 }  // namespace mmros::process
 #endif  // MMROS__PROCESS__IMAGE_HPP_
diff --git a/mmros/include/mmros/process/image_kernel.hpp b/mmros/include/mmros/process/image_kernel.hpp
new file mode 100644
index 0000000..885720b
--- /dev/null
+++ b/mmros/include/mmros/process/image_kernel.hpp
@@ -0,0 +1,203 @@
+// Copyright 2023 TIER IV, Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef MMROS__PROCESS__IMAGE_KERNEL_HPP_
+#define MMROS__PROCESS__IMAGE_KERNEL_HPP_
+
+#include <cublas_v2.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <cuda_runtime_api.h>
+#include <curand.h>
+
+namespace mmros::process
+{
+struct Roi
+{
+  int x;
+  int y;
+  int w;
+  int h;
+};
+
+/**
+ * @brief Resize a image using bilinear interpolation on gpus
+ * @param[out] dst Resized image
+ * @param[in] src image
+ * @param[in] d_w width for resized image
+ * @param[in] d_h height for resized image
+ * @param[in] d_c channel for resized image
+ * @param[in] s_w width for input image
+ * @param[in] s_h height for input image
+ * @param[in] s_c channel for input image
+ * @param[in] stream cuda stream
+ */
+extern void resize_bilinear_gpu(
+  unsigned char * dst, unsigned char * src, int d_w, int d_h, int d_c, int s_w, int s_h, int s_c,
+  cudaStream_t stream);
+
+/**
+ * @brief Letterbox a image on gpus
+ * @param[out] dst letterbox-ed image
+ * @param[in] src image
+ * @param[in] d_w width for letterbox-ing
+ * @param[in] d_h height for letterbox-ing
+ * @param[in] d_c channel for letterbox-ing
+ * @param[in] s_w width for input image
+ * @param[in] s_h height for input image
+ * @param[in] s_c channel for input image
+ * @param[in] stream cuda stream
+ */
+extern void letterbox_gpu(
+  unsigned char * dst, unsigned char * src, int d_w, int d_h, int d_c, int s_w, int s_h, int s_c,
+  cudaStream_t stream);
+
+/**
+ * @brief NHWC to NHWC conversion
+ * @param[out] dst converted image
+ * @param[in] src image
+ * @param[in] d_w width for a image
+ * @param[in] d_h height for a image
+ * @param[in] d_c channel for a image
+ * @param[in] stream cuda stream
+ */
+extern void nchw_to_nhwc_gpu(
+  unsigned char * dst, unsigned char * src, int d_w, int d_h, int d_c, cudaStream_t stream);
+
+/**
+ * @brief Unsigned char to float32 for inference
+ * @param[out] dst32 converted image
+ * @param[in] src image
+ * @param[in] d_w width for a image
+ * @param[in] d_h height for a image
+ * @param[in] d_c channel for a image
+ * @param[in] stream cuda stream
+ */
+extern void to_float_gpu(
+  float * dst32, unsigned char * src, int d_w, int d_h, int d_c, cudaStream_t stream);
+
+/**
+ * @brief Resize and letterbox a image using bilinear interpolation on gpus
+ * @param[out] dst processed image
+ * @param[in] src image
+ * @param[in] d_w width for output
+ * @param[in] d_h height for output
+ * @param[in] d_c channel for output
+ * @param[in] s_w width for input
+ * @param[in] s_h height for input
+ * @param[in] s_c channel for input
+ * @param[in] stream cuda stream
+ */
+extern void resize_bilinear_letterbox_gpu(
+  unsigned char * dst, unsigned char * src, int d_w, int d_h, int d_c, int s_w, int s_h, int s_c,
+  cudaStream_t stream);
+
+/**
+ * @brief Optimized preprocessing including resize, letterbox, nhwc2nchw, toFloat and normalization
+ * for YOLOX on gpus
+ * @param[out] dst processed image
+ * @param[in] src image
+ * @param[in] d_w width for output
+ * @param[in] d_h height for output
+ * @param[in] d_c channel for output
+ * @param[in] s_w width for input
+ * @param[in] s_h height for input
+ * @param[in] s_c channel for input
+ * @param[in] mean mean values for each channel
+ * @param[in] std std values for each channel
+ * @param[in] stream cuda stream
+ */
+extern void resize_bilinear_letterbox_nhwc_to_nchw32_gpu(
+  float * dst, unsigned char * src, int d_w, int d_h, int d_c, int s_w, int s_h, int s_c,
+  float * mean, float * std, cudaStream_t stream);
+
+/**
+ * @brief Optimized preprocessing including resize, letterbox, nhwc2nchw, toFloat and normalization
+ * with batching for YOLOX on gpus
+ * @param[out] dst processed image
+ * @param[in] src image
+ * @param[in] d_w width for output
+ * @param[in] d_h height for output
+ * @param[in] d_c channel for output
+ * @param[in] s_w width for input
+ * @param[in] s_h height for input
+ * @param[in] s_c channel for input
+ * @param[in] batch batch size
+ * @param[in] mean mean values for each channel
+ * @param[in] std std values for each channel
+ * @param[in] stream cuda stream
+ */
+extern void resize_bilinear_letterbox_nhwc_to_nchw32_batch_gpu(
+  float * dst, unsigned char * src, int d_w, int d_h, int d_c, int s_w, int s_h, int s_c, int batch,
+  float * mean, float * std, cudaStream_t stream);
+
+/**
+ * @brief Optimized preprocessing including crop, resize, letterbox, nhwc2nchw, toFloat and
+ * normalization with batching for YOLOX on gpus
+ * @param[out] dst processed image
+ * @param[in] src image
+ * @param[in] d_w width for output
+ * @param[in] d_h height for output
+ * @param[in] d_c channel for output
+ * @param[in] s_w width for input
+ * @param[in] s_h height for input
+ * @param[in] s_c channel for input
+ * @param[in] d_roi regions of interest for cropping
+ * @param[in] batch batch size
+ * @param[in] mean mean values for each channel
+ * @param[in] std std values for each channel
+ * @param[in] stream cuda stream
+ */
+extern void crop_resize_bilinear_letterbox_nhwc_to_nchw32_batch_gpu(
+  float * dst, unsigned char * src, int d_w, int d_h, int d_c, Roi * d_roi, int s_w, int s_h,
+  int s_c, int batch, float * mean, float * std, cudaStream_t stream);
+
+/**
+ * @brief Optimized multi-scale preprocessing including crop, resize, letterbox, nhwc2nchw, toFloat
+ * and normalization with batching for YOLOX on gpus
+ * @param[out] dst processed image
+ * @param[in] src image
+ * @param[in] d_w width for output
+ * @param[in] d_h height for output
+ * @param[in] d_c channel for output
+ * @param[in] s_w width for input
+ * @param[in] s_h height for input
+ * @param[in] s_c channel for input
+ * @param[in] d_roi regions of interest for cropping
+ * @param[in] batch batch size
+ * @param[in] mean mean values for each channel
+ * @param[in] std std values for each channel
+ * @param[in] stream cuda stream
+ */
+extern void multi_scale_resize_bilinear_letterbox_nhwc_to_nchw32_batch_gpu(
+  float * dst, unsigned char * src, int d_w, int d_h, int d_c, Roi * d_roi, int s_w, int s_h,
+  int s_c, int batch, float * mean, float * std, cudaStream_t stream);
+
+/**
+ * @brief Argmax on GPU
+ * @param[out] dst processed image
+ * @param[in] src probability map
+ * @param[in] d_w width for output
+ * @param[in] d_h height for output
+ * @param[in] s_w width for input
+ * @param[in] s_h height for input
+ * @param[in] s_c channel for input
+ * @param[in] batch batch size
+ * @param[in] stream cuda stream
+ */
+extern void argmax_gpu(
+  unsigned char * dst, float * src, int d_w, int d_h, int s_w, int s_h, int s_c, int batch,
+  cudaStream_t stream);
+}  // namespace mmros::process
+#endif  // MMROS__PROCESS__IMAGE_KERNEL_HPP_
diff --git a/mmros/src/detector/detector2d.cpp b/mmros/src/detector/detector2d.cpp
index 76312de..f330070 100644
--- a/mmros/src/detector/detector2d.cpp
+++ b/mmros/src/detector/detector2d.cpp
@@ -20,18 +20,13 @@
 #include "mmros/detector/utility.hpp"
 #include "mmros/process/image.hpp"
 #include "mmros/tensorrt/cuda_check_error.hpp"
-#include "mmros/tensorrt/cuda_unique_ptr.hpp"
 #include "mmros/tensorrt/utility.hpp"
 
 #include <NvInferRuntimeBase.h>
 
-#include <algorithm>
-#include <cstddef>
-#include <cstdint>
 #include <functional>
 #include <memory>
 #include <numeric>
-#include <optional>
 #include <sstream>
 #include <utility>
 #include <vector>
@@ -153,41 +148,9 @@ void Detector2D::initCudaPtr(size_t batch_size)
 /// Execute preprocess.
 void Detector2D::preprocess(const std::vector<cv::Mat> & images)
 {
-  // (B, C, H, W)
-  const auto batch_size = images.size();
-  auto in_dims = trt_common_->getTensorShape(0);
-
-  cuda::CudaUniquePtrHost<unsigned char[]> img_buf_h;
-  cuda::CudaUniquePtr<unsigned char[]> img_buf_d;
-
-  scales_.clear();
-  for (auto b = 0; b < images.size(); ++b) {
-    const auto & img = images.at(b);
-    if (!img_buf_h) {
-      img_buf_h = cuda::make_unique_host<unsigned char[]>(
-        img.cols * img.rows * 3 * batch_size, cudaHostAllocWriteCombined);
-      img_buf_d = cuda::make_unique<unsigned char[]>(img.cols * img.rows * 3 * batch_size);
-    }
-    const float scale =
-      std::min(static_cast<float>(in_width_) / img.cols, static_cast<float>(in_height_) / img.rows);
-    scales_.emplace_back(scale);
-
-    int index = b * img.cols * img.rows * 3;
-    // Copy into pinned memory
-    memcpy(img_buf_h.get() + index, &img.data[0], img.cols * img.rows * 3 * sizeof(unsigned char));
-  }
-
-  CHECK_CUDA_ERROR(
-    ::cudaMemcpyAsync(
-      img_buf_d.get(), img_buf_h.get(),
-      images[0].cols * images[0].rows * 3 * batch_size * sizeof(unsigned char),
-      ::cudaMemcpyHostToDevice, stream_));
-
-  process::resize_bilinear_letterbox_nhwc_to_nchw32_batch_gpu(
-    input_d_.get(), img_buf_d.get(), in_width_, in_height_, 3, images[0].cols, images[0].rows, 3,
-    batch_size, detector_config_->mean.get(), detector_config_->std.get(), stream_);
-
-  CHECK_CUDA_ERROR(cudaGetLastError());
+  process::preprocess_image(
+    input_d_.get(), scales_, images, in_width_, in_height_, detector_config_->mean.get(),
+    detector_config_->std.get(), stream_);
 }
 
 /// Execute postprocess
diff --git a/mmros/src/detector/instance_segmeter2d.cpp b/mmros/src/detector/instance_segmeter2d.cpp
index a6bbf64..6c66833 100644
--- a/mmros/src/detector/instance_segmeter2d.cpp
+++ b/mmros/src/detector/instance_segmeter2d.cpp
@@ -19,20 +19,13 @@
 #include "mmros/detector/utility.hpp"
 #include "mmros/process/image.hpp"
 #include "mmros/tensorrt/cuda_check_error.hpp"
-#include "mmros/tensorrt/cuda_unique_ptr.hpp"
-#include "mmros/tensorrt/tensorrt_common.hpp"
 #include "mmros/tensorrt/utility.hpp"
 
 #include <opencv2/core.hpp>
-#include <opencv2/core/mat.hpp>
-#include <opencv2/core/types.hpp>
 #include <opencv2/imgproc.hpp>
 
 #include <NvInferRuntimeBase.h>
 
-#include <algorithm>
-#include <cstddef>
-#include <cstring>
 #include <functional>
 #include <memory>
 #include <numeric>
@@ -160,41 +153,9 @@ void InstanceSegmenter2D::initCudaPtr(size_t batch_size)
 
 void InstanceSegmenter2D::preprocess(const std::vector<cv::Mat> & images)
 {
-  // (B, C, H, W)
-  const auto batch_size = images.size();
-  auto in_dims = trt_common_->getInputDims(0);
-
-  cuda::CudaUniquePtrHost<unsigned char[]> img_buf_h;
-  cuda::CudaUniquePtr<unsigned char[]> img_buf_d;
-
-  scales_.clear();
-  for (auto b = 0; b < images.size(); ++b) {
-    const auto & img = images.at(b);
-    if (!img_buf_h) {
-      img_buf_h = cuda::make_unique_host<unsigned char[]>(
-        img.cols * img.rows * 3 * batch_size, cudaHostAllocWriteCombined);
-      img_buf_d = cuda::make_unique<unsigned char[]>(img.cols * img.rows * 3 * batch_size);
-    }
-    const float scale =
-      std::min(static_cast<float>(in_width_) / img.cols, static_cast<float>(in_height_) / img.rows);
-    scales_.emplace_back(scale);
-
-    int index = b * img.cols * img.rows * 3;
-    // Copy into pinned memory
-    memcpy(img_buf_h.get() + index, &img.data[0], img.cols * img.rows * 3 * sizeof(unsigned char));
-  }
-
-  CHECK_CUDA_ERROR(
-    ::cudaMemcpyAsync(
-      img_buf_d.get(), img_buf_h.get(),
-      images[0].cols * images[0].rows * 3 * batch_size * sizeof(unsigned char),
-      ::cudaMemcpyHostToDevice, stream_));
-
-  process::resize_bilinear_letterbox_nhwc_to_nchw32_batch_gpu(
-    input_d_.get(), img_buf_d.get(), in_width_, in_height_, 3, images[0].cols, images[0].rows, 3,
-    batch_size, detector_config_->mean.get(), detector_config_->std.get(), stream_);
-
-  CHECK_CUDA_ERROR(cudaGetLastError());
+  process::preprocess_image(
+    input_d_.get(), scales_, images, in_width_, in_height_, detector_config_->mean.get(),
+    detector_config_->std.get(), stream_);
 }
 
 archetype::Result<outputs_type> InstanceSegmenter2D::postprocess(
diff --git a/mmros/src/detector/panoptic_segmenter2d.cpp b/mmros/src/detector/panoptic_segmenter2d.cpp
index c05e642..2f8a3c6 100644
--- a/mmros/src/detector/panoptic_segmenter2d.cpp
+++ b/mmros/src/detector/panoptic_segmenter2d.cpp
@@ -20,17 +20,12 @@
 #include "mmros/detector/utility.hpp"
 #include "mmros/process/image.hpp"
 #include "mmros/tensorrt/cuda_check_error.hpp"
-#include "mmros/tensorrt/cuda_unique_ptr.hpp"
 
 #include <opencv2/core/mat.hpp>
 
-#include <algorithm>
-#include <cstddef>
-#include <cstring>
 #include <functional>
 #include <memory>
 #include <numeric>
-#include <optional>
 #include <utility>
 #include <vector>
 
@@ -168,41 +163,9 @@ void PanopticSegmenter2D::initCudaPtr(size_t batch_size)
 
 void PanopticSegmenter2D::preprocess(const std::vector<cv::Mat> & images)
 {
-  // (B, C, H, W)
-  const auto batch_size = images.size();
-  auto in_dims = trt_common_->getTensorShape(0);
-
-  cuda::CudaUniquePtrHost<unsigned char[]> img_buf_h;
-  cuda::CudaUniquePtr<unsigned char[]> img_buf_d;
-
-  scales_.clear();
-  for (auto b = 0; b < images.size(); ++b) {
-    const auto & img = images.at(b);
-    if (!img_buf_h) {
-      img_buf_h = cuda::make_unique_host<unsigned char[]>(
-        img.cols * img.rows * 3 * batch_size, cudaHostAllocWriteCombined);
-      img_buf_d = cuda::make_unique<unsigned char[]>(img.cols * img.rows * 3 * batch_size);
-    }
-    const float scale =
-      std::min(static_cast<float>(in_width_) / img.cols, static_cast<float>(in_height_) / img.rows);
-    scales_.emplace_back(scale);
-
-    int index = b * img.cols * img.rows * 3;
-    // Copy into pinned memory
-    memcpy(img_buf_h.get() + index, &img.data[0], img.cols * img.rows * 3 * sizeof(unsigned char));
-  }
-
-  CHECK_CUDA_ERROR(
-    ::cudaMemcpyAsync(
-      img_buf_d.get(), img_buf_h.get(),
-      images[0].cols * images[0].rows * 3 * batch_size * sizeof(unsigned char),
-      ::cudaMemcpyHostToDevice, stream_));
-
-  process::resize_bilinear_letterbox_nhwc_to_nchw32_batch_gpu(
-    input_d_.get(), img_buf_d.get(), in_width_, in_height_, 3, images[0].cols, images[0].rows, 3,
-    batch_size, detector_config_->mean.get(), detector_config_->std.get(), stream_);
-
-  CHECK_CUDA_ERROR(cudaGetLastError());
+  process::preprocess_image(
+    input_d_.get(), scales_, images, in_width_, in_height_, detector_config_->mean.get(),
+    detector_config_->std.get(), stream_);
 }
 
 archetype::Result<outputs_type> PanopticSegmenter2D::postprocess(
diff --git a/mmros/src/detector/semantic_segmenter2d.cpp b/mmros/src/detector/semantic_segmenter2d.cpp
index 4de7d67..bf184f9 100644
--- a/mmros/src/detector/semantic_segmenter2d.cpp
+++ b/mmros/src/detector/semantic_segmenter2d.cpp
@@ -18,7 +18,6 @@
 #include "mmros/archetype/result.hpp"
 #include "mmros/process/image.hpp"
 #include "mmros/tensorrt/cuda_check_error.hpp"
-#include "mmros/tensorrt/cuda_unique_ptr.hpp"
 
 #include <opencv2/core/mat.hpp>
 #include <opencv2/imgproc.hpp>
@@ -26,13 +25,9 @@
 #include <NvInferRuntime.h>
 
 #include <algorithm>
-#include <cstddef>
-#include <cstring>
 #include <functional>
 #include <memory>
 #include <numeric>
-#include <optional>
-#include <stdexcept>
 #include <utility>
 #include <vector>
 
@@ -144,41 +139,9 @@ void SemanticSegmenter2D::initCudaPtr(size_t batch_size)
 
 void SemanticSegmenter2D::preprocess(const std::vector<cv::Mat> & images)
 {
-  // (B, C, H, W)
-  const auto batch_size = images.size();
-  auto in_dims = trt_common_->getTensorShape(0);
-
-  cuda::CudaUniquePtrHost<unsigned char[]> img_buf_h;
-  cuda::CudaUniquePtr<unsigned char[]> img_buf_d;
-
-  scales_.clear();
-  for (auto b = 0; b < images.size(); ++b) {
-    const auto & img = images.at(b);
-    if (!img_buf_h) {
-      img_buf_h = cuda::make_unique_host<unsigned char[]>(
-        img.cols * img.rows * 3 * batch_size, cudaHostAllocWriteCombined);
-      img_buf_d = cuda::make_unique<unsigned char[]>(img.cols * img.rows * 3 * batch_size);
-    }
-    const float scale =
-      std::min(static_cast<float>(in_width_) / img.cols, static_cast<float>(in_height_) / img.rows);
-    scales_.emplace_back(scale);
-
-    int index = b * img.cols * img.rows * 3;
-    // Copy into pinned memory
-    memcpy(img_buf_h.get() + index, &img.data[0], img.cols * img.rows * 3 * sizeof(unsigned char));
-  }
-
-  CHECK_CUDA_ERROR(
-    ::cudaMemcpyAsync(
-      img_buf_d.get(), img_buf_h.get(),
-      images[0].cols * images[0].rows * 3 * batch_size * sizeof(unsigned char),
-      ::cudaMemcpyHostToDevice, stream_));
-
-  process::resize_bilinear_letterbox_nhwc_to_nchw32_batch_gpu(
-    input_d_.get(), img_buf_d.get(), in_width_, in_height_, 3, images[0].cols, images[0].rows, 3,
-    batch_size, detector_config_->mean.get(), detector_config_->std.get(), stream_);
-
-  CHECK_CUDA_ERROR(cudaGetLastError());
+  process::preprocess_image(
+    input_d_.get(), scales_, images, in_width_, in_height_, detector_config_->mean.get(),
+    detector_config_->std.get(), stream_);
 }
 
 archetype::Result<outputs_type> SemanticSegmenter2D::postprocess(
diff --git a/mmros/src/process/image.cpp b/mmros/src/process/image.cpp
new file mode 100644
index 0000000..5f1dba8
--- /dev/null
+++ b/mmros/src/process/image.cpp
@@ -0,0 +1,63 @@
+// Copyright 2025 Kotaro Uetake.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "mmros/process/image.hpp"
+
+#include "mmros/tensorrt/cuda_unique_ptr.hpp"
+
+#include <algorithm>
+#include <vector>
+
+namespace mmros::process
+{
+void preprocess_image(
+  float * input_d, std::vector<float> & scales, const std::vector<cv::Mat> & images,
+  int64_t in_width, int64_t in_height, float * mean, float * std, cudaStream_t stream)
+{
+  // (B, C, H, W)
+  const auto batch_size = images.size();
+
+  cuda::CudaUniquePtrHost<unsigned char[]> img_buf_h;
+  cuda::CudaUniquePtr<unsigned char[]> img_buf_d;
+
+  scales.clear();
+  for (size_t b = 0; b < images.size(); ++b) {
+    const auto & img = images.at(b);
+    if (!img_buf_h) {
+      img_buf_h = cuda::make_unique_host<unsigned char[]>(
+        img.cols * img.rows * 3 * batch_size, cudaHostAllocWriteCombined);
+      img_buf_d = cuda::make_unique<unsigned char[]>(img.cols * img.rows * 3 * batch_size);
+    }
+    const float scale =
+      std::min(static_cast<float>(in_width) / img.cols, static_cast<float>(in_height) / img.rows);
+    scales.emplace_back(scale);
+
+    size_t index = b * img.cols * img.rows * 3;
+    // Copy into pinned memory
+    memcpy(img_buf_h.get() + index, &img.data[0], img.cols * img.rows * 3 * sizeof(unsigned char));
+  }
+
+  CHECK_CUDA_ERROR(
+    ::cudaMemcpyAsync(
+      img_buf_d.get(), img_buf_h.get(),
+      images[0].cols * images[0].rows * 3 * batch_size * sizeof(unsigned char),
+      ::cudaMemcpyHostToDevice, stream));
+
+  process::resize_bilinear_letterbox_nhwc_to_nchw32_batch_gpu(
+    input_d, img_buf_d.get(), in_width, in_height, 3, images[0].cols, images[0].rows, 3, batch_size,
+    mean, std, stream);
+
+  CHECK_CUDA_ERROR(cudaGetLastError());
+}
+}  // namespace mmros::process
diff --git a/mmros/src/process/image.cu b/mmros/src/process/image_kernel.cu
similarity index 99%
rename from mmros/src/process/image.cu
rename to mmros/src/process/image_kernel.cu
index 24c41ee..dd5e296 100644
--- a/mmros/src/process/image.cu
+++ b/mmros/src/process/image_kernel.cu
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "mmros/process/image.hpp"
+#include "mmros/process/image_kernel.hpp"
 
 #include <stdio.h>
 #include <stdlib.h>