abstract the partially pinned buffers a bit

MartinPulec · MartinPulec · commit 824c5146907d · 2025-04-30T15:27:14.000+02:00
the commit 953442a (with further changes/fixes)
diff --git a/src/gpujpeg_common.c b/src/gpujpeg_common.c
@@ -2204,4 +2204,31 @@ format_number_with_delim(size_t num, char* buf, size_t buflen)
     return ptr;
 }
 
+/**
+ * @brief cudaMemcpyAsync wrapper that tolerates partially pinned host buffers
+ *
+ * Some buffers are only partially pinned (currently gpujpeg_coder::data_compressed) for 2 reasons:
+ * - to speed up the initialization - allocating huge pinned buffers (cudaMallocHost) takes a noticeable amount of time
+ * - if not used, which is the vast majority of size, it still occupies the allocated amount of _physical_ memory
+ *
+ * This solution has an unfortunate drawback that cudaMemcpyAsync cannot be performed across the pinned and non-pinned
+ * boundary. So in the (perhaps rare) case when the size is higher than pinned_sz, 2x memcpy must be used.
+ * @param pinned_sz size in bytes of the pinned prefix of the buffer; other arguments are as for cudaMemcpyAsync
+ * @return cudaSuccess, or the error returned by the first cudaMemcpyAsync call that failed
+ * @sa gpujpeg_coder_init_image (cudaHostRegister), gpujpeg_coder_deinit (cudaHostUnregister)
+ */
+cudaError_t
+gpujpeg_cuda_memcpy_async_partially_pinned(void* dst, const void* src, size_t count, enum cudaMemcpyKind kind,
+                                            cudaStream_t stream, size_t pinned_sz)
+{
+    // first copy: the part of the transfer that lies below the pinned boundary
+    cudaError_t err = cudaMemcpyAsync(dst, src, MIN(count, pinned_sz), kind, stream);
+    if ( err != cudaSuccess || count <= pinned_sz ) {
+        return err;
+    }
+    // second copy: the remainder past the boundary must be issued separately
+    return cudaMemcpyAsync((uint8_t*)dst + pinned_sz, (const uint8_t*)src + pinned_sz, count - pinned_sz, kind,
+                           stream);
+}
+
 /* vi: set expandtab sw=4 : */
diff --git a/src/gpujpeg_common_internal.h b/src/gpujpeg_common_internal.h
@@ -373,7 +373,7 @@ struct gpujpeg_coder
     size_t data_allocated_size;
 
     /// Huffman coder data in host memory (output/input for encoder/decoder)
-    /// only **partially** pinned (needs special treatment - @sa data_compressed_pinned_sz occurrences)
+    /// only **partially** pinned (needs special treatment - @sa gpujpeg_cuda_memcpy_async_partially_pinned)
     uint8_t* data_compressed;
     size_t data_compressed_pinned_sz; ///< amount of pinned memory from data_compressed
     /// Huffman coder data in device memory (output/input for encoder/decoder)
@@ -516,6 +516,9 @@ gpujpeg_make_sampling_factor(int comp_count, int comp1_h, int comp1_v, int comp2
                                  (sampling_factor)[2].horizontal, (sampling_factor)[2].vertical,                       \
                                  (sampling_factor)[3].horizontal, (sampling_factor)[3].vertical)
 
+cudaError_t
+gpujpeg_cuda_memcpy_async_partially_pinned(void* dst, const void* src, size_t count, enum cudaMemcpyKind kind,
+                                           cudaStream_t stream, size_t pinned_sz);
 
 #ifdef __cplusplus
 } // extern "C"
diff --git a/src/gpujpeg_decoder.c b/src/gpujpeg_decoder.c
@@ -297,17 +297,10 @@ gpujpeg_decoder_decode(struct gpujpeg_decoder* decoder, uint8_t* image, size_t i
         cudaMemsetAsync(coder->d_data_quantized, 0, coder->data_size * sizeof(int16_t), decoder->stream);
 
         // Copy scan data to device memory
-        cudaMemcpyAsync(coder->d_data_compressed, coder->data_compressed,
-                        MIN(decoder->data_compressed_size * sizeof(uint8_t), coder->data_compressed_pinned_sz),
-                        cudaMemcpyHostToDevice, decoder->stream);
-        gpujpeg_cuda_check_error("Decoder copy compressed data to pinned memory", return -1);
-        if ( decoder->data_compressed_size * sizeof(uint8_t) > coder->data_compressed_pinned_sz ) {
-            cudaMemcpyAsync(coder->d_data_compressed + coder->data_compressed_pinned_sz,
-                            coder->data_compressed + coder->data_compressed_pinned_sz,
-                            decoder->data_compressed_size - coder->data_compressed_pinned_sz, cudaMemcpyHostToDevice,
-                            decoder->stream);
-            gpujpeg_cuda_check_error("Decoder copy compressed data to pageable memory", return -1);
-        }
+        gpujpeg_cuda_memcpy_async_partially_pinned(coder->d_data_compressed, coder->data_compressed, decoder->data_compressed_size,
+                                                   cudaMemcpyHostToDevice, decoder->stream,
+                                                   coder->data_compressed_pinned_sz);
+        gpujpeg_cuda_check_error("Decoder copy compressed data to memory", return -1);
 
         // Copy segments to device memory
         cudaMemcpyAsync(coder->d_segment, coder->segment, decoder->segment_count * sizeof(struct gpujpeg_segment), cudaMemcpyHostToDevice, decoder->stream);
diff --git a/src/gpujpeg_encoder.c b/src/gpujpeg_encoder.c
@@ -536,15 +536,11 @@ gpujpeg_encoder_encode(struct gpujpeg_encoder* encoder, const struct gpujpeg_par
         GPUJPEG_CUSTOM_TIMER_START(coder->duration_memory_from, coder->param.perf_stats, encoder->stream, return -1);
 
         // Copy compressed data from device memory to cpu memory
-        cudaMemcpyAsync(coder->data_compressed, coder->d_data_compressed,
-                        MIN(output_size, coder->data_compressed_pinned_sz), cudaMemcpyDeviceToHost, encoder->stream);
-        gpujpeg_cuda_check_error("Encoder copy compressed data to pinned memory", return -1);
-        if ( output_size > coder->data_compressed_pinned_sz ) {
-            cudaMemcpyAsync(coder->data_compressed + coder->data_compressed_pinned_sz,
-                            coder->d_data_compressed + coder->data_compressed_pinned_sz,
-                            output_size - coder->data_compressed_pinned_sz, cudaMemcpyDeviceToHost, encoder->stream);
-            gpujpeg_cuda_check_error("Encoder copy compressed data to pageable emory", return -1);
-        }
+        gpujpeg_cuda_memcpy_async_partially_pinned(coder->data_compressed, coder->d_data_compressed, output_size,
+                                                   cudaMemcpyDeviceToHost, encoder->stream,
+                                                   coder->data_compressed_pinned_sz);
+        gpujpeg_cuda_check_error("Encoder copy compressed data to memory", return -1);
+
         // Copy segments from device memory
         if ( cudaSuccess != cudaMemcpyAsync(coder->segment, coder->d_segment, coder->segment_count * sizeof(struct gpujpeg_segment), cudaMemcpyDeviceToHost, encoder->stream) ) {
             return -1;