@@ -412,36 +412,29 @@ void Input::cloneBlobIfRequired() {
412412 static const size_t batch_size = 2048 ;
413413 const size_t iterations_num = size / batch_size + 1 ;
414414
415- volatile bool has_subnormals_local = false ;
416- volatile bool has_bf16_overflows_local = false ;
417- if (needFlushDenormalsToZero) {
415+ std::atomic< bool > has_subnormals_local ( false ) ;
416+ std::atomic< bool > has_bf16_overflows_local ( false ) ;
417+ if (needFlushDenormalsToZero || do_bf16_saturation_check ) {
418418 parallel_for (iterations_num, [&](int n) {
419419 auto ptr = u32data + n * batch_size;
420- const jit_has_special_value_base::args_t args1 = {
421- reinterpret_cast < float const *>( ptr),
422- std::min (batch_size, ( size_t )(u32data + size - ptr)),
423- false };
424-
425- fn (&args1 );
426-
427- if (args1. hasTargetValues ) {
428- has_subnormals_local = true ;
420+ jit_has_special_value_base::args_t args = {reinterpret_cast < float const *>(ptr),
421+ std::min (batch_size, ( size_t )(u32data + size - ptr) ),
422+ false };
423+
424+ if (needFlushDenormalsToZero && !has_subnormals_local) {
425+ fn (&args );
426+ if (args. hasTargetValues ) {
427+ has_subnormals_local = true ;
428+ }
429429 }
430- });
431- }
432-
433- if (do_bf16_saturation_check) {
434- parallel_for (iterations_num, [&](int n) {
435- auto ptr2 = f32data + n * batch_size;
436- const jit_has_special_value_base::args_t args2 = {
437- reinterpret_cast <float const *>(ptr2),
438- std::min (batch_size, (size_t )(f32data + size - ptr2)),
439- false };
440-
441- fn_bf16_check (&args2);
442430
443- if (args2.hasTargetValues ) {
444- has_bf16_overflows_local = true ;
431+ if (do_bf16_saturation_check && !has_bf16_overflows_local) {
432+ // batch_size is small enough, so source data are still cache-hot
433+ args.hasTargetValues = false ;
434+ fn_bf16_check (&args);
435+ if (args.hasTargetValues ) {
436+ has_bf16_overflows_local = true ;
437+ }
445438 }
446439 });
447440 }
0 commit comments