Skip to content

Commit 824c514

Browse files
committed
abstract the partially pinned buffers a bit
the commit 953442a (with further changes/fixes)
1 parent 7fff1c7 commit 824c514

File tree

4 files changed

+40
-21
lines changed

4 files changed

+40
-21
lines changed

src/gpujpeg_common.c

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2204,4 +2204,31 @@ format_number_with_delim(size_t num, char* buf, size_t buflen)
22042204
return ptr;
22052205
}
22062206

2207+
/**
2208+
* @brief tweaked cudaMemcpyAsync alternative allowing partially-pinned buffers
2209+
*
2210+
* Some buffers are only partially pinned (currently gpujpeg_coder::data_compressed) for 2 reason:
2211+
* - to speed up the initialization - allcating huge pinned buffers (cudaHostMalloc) takes noticable amount of time
2212+
* - if not used, which is the vast majority of size, it still ocuppies the allocated amount of _physical_ memory
2213+
*
2214+
* This solution has an unfortunate drawback that cudaMemcpyAsync cannot be performend across the pinned and non-pinned
2215+
* boundary. So in the (perhaps rare) case when the size is higher than pinned_sz, 2x memcpy must be used.
2216+
*
2217+
* @sa gpujpeg_coder_init_image (cudaHostRegister)
2218+
* @sa gpujpeg_coder_deinit (cudaHostUnregister)
2219+
*/
2220+
cudaError_t
2221+
gpujpeg_cuda_memcpy_async_partially_pinned(void* dst, const void* src, size_t count, enum cudaMemcpyKind kind,
2222+
cudaStream_t stream, size_t pinned_sz)
2223+
{
2224+
cudaError_t err = cudaMemcpyAsync(dst, src, MIN(count, pinned_sz), kind, stream);
2225+
if ( err != cudaSuccess ) {
2226+
return err;
2227+
}
2228+
if ( count > pinned_sz ) {
2229+
err = cudaMemcpyAsync((uint8_t*)dst + pinned_sz, (uint8_t*)src + pinned_sz, count - pinned_sz, kind, stream);
2230+
}
2231+
return err;
2232+
}
2233+
22072234
/* vi: set expandtab sw=4 : */

src/gpujpeg_common_internal.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -373,7 +373,7 @@ struct gpujpeg_coder
373373
size_t data_allocated_size;
374374

375375
/// Huffman coder data in host memory (output/input for encoder/decoder)
376-
/// only **partially** pinned (needs special treatment - @sa data_compressed_pinned_sz occurrences)
376+
/// only **partially** pinned (needs special treatment - @sa gpujpeg_cuda_memcpy_async_partially_pinned)
377377
uint8_t* data_compressed;
378378
size_t data_compressed_pinned_sz; ///< amount of pinned memory from data_compressed
379379
/// Huffman coder data in device memory (output/input for encoder/decoder)
@@ -516,6 +516,9 @@ gpujpeg_make_sampling_factor(int comp_count, int comp1_h, int comp1_v, int comp2
516516
(sampling_factor)[2].horizontal, (sampling_factor)[2].vertical, \
517517
(sampling_factor)[3].horizontal, (sampling_factor)[3].vertical)
518518

519+
cudaError_t
520+
gpujpeg_cuda_memcpy_async_partially_pinned(void* dst, const void* src, size_t count, enum cudaMemcpyKind kind,
521+
cudaStream_t stream, size_t pinned_sz);
519522

520523
#ifdef __cplusplus
521524
} // extern "C"

src/gpujpeg_decoder.c

Lines changed: 4 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -297,17 +297,10 @@ gpujpeg_decoder_decode(struct gpujpeg_decoder* decoder, uint8_t* image, size_t i
297297
cudaMemsetAsync(coder->d_data_quantized, 0, coder->data_size * sizeof(int16_t), decoder->stream);
298298

299299
// Copy scan data to device memory
300-
cudaMemcpyAsync(coder->d_data_compressed, coder->data_compressed,
301-
MIN(decoder->data_compressed_size * sizeof(uint8_t), coder->data_compressed_pinned_sz),
302-
cudaMemcpyHostToDevice, decoder->stream);
303-
gpujpeg_cuda_check_error("Decoder copy compressed data to pinned memory", return -1);
304-
if ( decoder->data_compressed_size * sizeof(uint8_t) > coder->data_compressed_pinned_sz ) {
305-
cudaMemcpyAsync(coder->d_data_compressed + coder->data_compressed_pinned_sz,
306-
coder->data_compressed + coder->data_compressed_pinned_sz,
307-
decoder->data_compressed_size - coder->data_compressed_pinned_sz, cudaMemcpyHostToDevice,
308-
decoder->stream);
309-
gpujpeg_cuda_check_error("Decoder copy compressed data to pageable memory", return -1);
310-
}
300+
gpujpeg_cuda_memcpy_async_partially_pinned(coder->d_data_compressed, coder->data_compressed, decoder->data_compressed_size,
301+
cudaMemcpyHostToDevice, decoder->stream,
302+
coder->data_compressed_pinned_sz);
303+
gpujpeg_cuda_check_error("Decoder copy compressed data to memory", return -1);
311304

312305
// Copy segments to device memory
313306
cudaMemcpyAsync(coder->d_segment, coder->segment, decoder->segment_count * sizeof(struct gpujpeg_segment), cudaMemcpyHostToDevice, decoder->stream);

src/gpujpeg_encoder.c

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -536,15 +536,11 @@ gpujpeg_encoder_encode(struct gpujpeg_encoder* encoder, const struct gpujpeg_par
536536
GPUJPEG_CUSTOM_TIMER_START(coder->duration_memory_from, coder->param.perf_stats, encoder->stream, return -1);
537537

538538
// Copy compressed data from device memory to cpu memory
539-
cudaMemcpyAsync(coder->data_compressed, coder->d_data_compressed,
540-
MIN(output_size, coder->data_compressed_pinned_sz), cudaMemcpyDeviceToHost, encoder->stream);
541-
gpujpeg_cuda_check_error("Encoder copy compressed data to pinned memory", return -1);
542-
if ( output_size > coder->data_compressed_pinned_sz ) {
543-
cudaMemcpyAsync(coder->data_compressed + coder->data_compressed_pinned_sz,
544-
coder->d_data_compressed + coder->data_compressed_pinned_sz,
545-
output_size - coder->data_compressed_pinned_sz, cudaMemcpyDeviceToHost, encoder->stream);
546-
gpujpeg_cuda_check_error("Encoder copy compressed data to pageable emory", return -1);
547-
}
539+
gpujpeg_cuda_memcpy_async_partially_pinned(coder->data_compressed, coder->d_data_compressed, output_size,
540+
cudaMemcpyDeviceToHost, encoder->stream,
541+
coder->data_compressed_pinned_sz);
542+
gpujpeg_cuda_check_error("Encoder copy compressed data to memory", return -1);
543+
548544
// Copy segments from device memory
549545
if ( cudaSuccess != cudaMemcpyAsync(coder->segment, coder->d_segment, coder->segment_count * sizeof(struct gpujpeg_segment), cudaMemcpyDeviceToHost, encoder->stream) ) {
550546
return -1;

0 commit comments

Comments
 (0)