@@ -2535,6 +2535,7 @@ impl CudaStream {
25352535
25362536#[ cfg( test) ]
25372537mod tests {
2538+ use std:: hint:: black_box;
25382539 use std:: time:: Instant ;
25392540
25402541 use super :: * ;
@@ -2716,6 +2717,44 @@ mod tests {
27162717 assert_eq ! ( & host, & truth) ;
27172718 }
27182719
/// Verifies that host-side reads from a pinned buffer allocated with flags `0`
/// take less than half the time of the same reads from a buffer allocated with
/// `CU_MEMHOSTALLOC_WRITECOMBINED`.
///
/// Both buffers hold `1 << 20` `u32` values, all set to `1`, and each buffer is
/// summed five times under a single timer. The test creates a context on
/// device ordinal 0 and panics (via `unwrap`) if that or any allocation fails.
#[test]
fn test_default_pinned_host_reads_are_faster_than_write_combined() {
    /// Sums every element of `data`, `passes` times over, and returns the total
    /// elapsed wall-clock time together with the wrapping `u64` checksum.
    fn sum_repeatedly(data: &[u32], passes: usize) -> (std::time::Duration, u64) {
        let timer = Instant::now();
        let mut checksum = 0_u64;
        for _ in 0..passes {
            // Routing the slice through `black_box` on every pass hints the
            // optimizer not to merge or elide the repeated reads.
            checksum = black_box(data)
                .iter()
                .fold(checksum, |acc, &word| acc.wrapping_add(u64::from(word)));
        }
        (timer.elapsed(), black_box(checksum))
    }

    let ctx = CudaContext::new(0).unwrap();
    let len = 1 << 20;
    let passes = 5;

    // SAFETY: NOTE(review): the safety contract of `alloc_pinned_with_flags` is
    // not visible from this test -- confirm against its documentation. Every
    // element of both buffers is written by `fill` below before it is read.
    let mut wc_buf = unsafe {
        ctx.alloc_pinned_with_flags::<u32>(len, sys::CU_MEMHOSTALLOC_WRITECOMBINED)
    }
    .unwrap();
    // SAFETY: see the note on the write-combined allocation above.
    let mut plain_buf = unsafe { ctx.alloc_pinned_with_flags::<u32>(len, 0) }.unwrap();

    wc_buf.as_mut_slice().unwrap().fill(1);
    plain_buf.as_mut_slice().unwrap().fill(1);

    // The write-combined buffer is measured first, then the default one.
    let wc_view = wc_buf.as_slice().unwrap();
    let (write_combined_elapsed, wc_checksum) = sum_repeatedly(wc_view, passes);
    let plain_view = plain_buf.as_slice().unwrap();
    let (default_elapsed, plain_checksum) = sum_repeatedly(plain_view, passes);

    // Identical contents must give identical checksums; this also keeps both
    // results observably used.
    assert_eq!(wc_checksum, plain_checksum);
    std::println!(
        "default pinned host reads: {default_elapsed:?}; write-combined host reads: {write_combined_elapsed:?}"
    );
    // Only a 2x gap is required, leaving headroom for variance between
    // machines and devices.
    assert!(
        write_combined_elapsed.as_secs_f32() > default_elapsed.as_secs_f32() * 2.0,
        "{default_elapsed:?} vs {write_combined_elapsed:?}"
    );
}
2757+
27192758 #[ test]
27202759 fn test_pinned_copy_is_faster ( ) {
27212760 let ctx = CudaContext :: new ( 0 ) . unwrap ( ) ;
0 commit comments