1- use hypercube_alloc:: { buffer , Buffer , CpuBackend } ;
1+ use hypercube_alloc:: { Buffer , CpuBackend } ;
22use hypercube_tensor:: { Dimensions , Tensor } ;
33use p3_field:: { AbstractExtensionField , AbstractField } ;
44use rayon:: prelude:: * ;
5+ use std:: sync:: { Arc , Mutex } ;
56
67use crate :: { partial_lagrange_blocking, Point } ;
78
@@ -16,22 +17,70 @@ pub(crate) fn eval_mle_at_point_blocking<
1617 let mut sizes = mle. sizes ( ) . to_vec ( ) ;
1718 sizes. remove ( 0 ) ;
1819 let dimensions = Dimensions :: try_from ( sizes) . unwrap ( ) ;
19- let mut dst = Tensor { storage : buffer ! [ ] , dimensions } ;
20- let total_len = dst. total_len ( ) ;
21- let dot_products = mle
22- . as_buffer ( )
20+ let total_len = dimensions. total_len ( ) ;
21+
22+ // Pre-allocation of the result buffer
23+ let result = Arc :: new ( Mutex :: new ( vec ! [ EF :: zero( ) ; total_len] ) ) ;
24+
25+ // Process in parallel using Rayon
26+ mle. as_buffer ( )
2327 . par_chunks_exact ( mle. strides ( ) [ 0 ] )
2428 . zip ( partial_lagrange. as_buffer ( ) . par_iter ( ) )
25- . map ( |( chunk, scalar) | chunk. iter ( ) . map ( |a| scalar. clone ( ) * a. clone ( ) ) . collect ( ) )
26- . reduce (
27- || vec ! [ EF :: zero( ) ; total_len] ,
28- |mut a, b| {
29- a. iter_mut ( ) . zip ( b. iter ( ) ) . for_each ( |( a, b) | * a += b. clone ( ) ) ;
30- a
31- } ,
32- ) ;
29+ . for_each ( |( chunk, scalar) | {
30+ // Process each chunk with a thread-local accumulator
31+ let mut local_result = vec ! [ EF :: zero( ) ; total_len] ;
32+
33+ // Avoid allocation in the inner loop
34+ for ( i, a) in chunk. iter ( ) . enumerate ( ) {
35+ if i < total_len {
36+ // Compute scalar * a directly into the accumulator
37+ local_result[ i] = scalar. clone ( ) * a. clone ( ) ;
38+ }
39+ }
40+
41+ // Update the global result with our local computation
42+ let result_clone = Arc :: clone ( & result) ;
43+ let mut global_result = result_clone. lock ( ) . unwrap ( ) ;
44+ for i in 0 ..total_len {
45+ global_result[ i] += local_result[ i] . clone ( ) ;
46+ }
47+ } ) ;
3348
34- let dot_products = Buffer :: from ( dot_products ) ;
35- dst . storage = dot_products ;
36- dst
49+ // Create the final tensor
50+ let result_buffer = Buffer :: from ( Arc :: try_unwrap ( result ) . unwrap ( ) . into_inner ( ) . unwrap ( ) ) ;
51+ Tensor { storage : result_buffer , dimensions }
3752}
53+
54+ // Add a specialized implementation for the case when the number of polynomials is small
55+ pub ( crate ) fn eval_mle_at_point_small_batch <
56+ F : AbstractField + Sync ,
57+ EF : AbstractExtensionField < F > + Send + Sync ,
58+ > (
59+ mle : & Tensor < F , CpuBackend > ,
60+ point : & Point < EF , CpuBackend > ,
61+ ) -> Tensor < EF , CpuBackend > {
62+ // For small batches (fewer than 4 polynomials), use a different approach
63+ // that avoids the overhead of parallelization
64+ let partial_lagrange = partial_lagrange_blocking ( point) ;
65+ let mut sizes = mle. sizes ( ) . to_vec ( ) ;
66+ sizes. remove ( 0 ) ;
67+ let dimensions = Dimensions :: try_from ( sizes) . unwrap ( ) ;
68+ let total_len = dimensions. total_len ( ) ;
69+
70+ // Direct computation without parallelization for small batches
71+ let mut result = vec ! [ EF :: zero( ) ; total_len] ;
72+
73+ for ( chunk, scalar) in mle. as_buffer ( )
74+ . chunks_exact ( mle. strides ( ) [ 0 ] )
75+ . zip ( partial_lagrange. as_buffer ( ) . iter ( ) )
76+ {
77+ for ( i, a) in chunk. iter ( ) . enumerate ( ) {
78+ if i < total_len {
79+ result[ i] += scalar. clone ( ) * a. clone ( ) ;
80+ }
81+ }
82+ }
83+
84+ let result_buffer = Buffer :: from ( result) ;
85+ Tensor { storage : result_buffer, dimensions }
86+ }
0 commit comments