@@ -262,6 +262,71 @@ void Input::cloneBlobIfRequired() {
262262 needFlushDenormalsToZero = false ;
263263 }
264264
265+ // The presence of subnormals is better to determined at IR read time.
266+ auto checkSubnormalsAndBF16Overflows = [&](bool & has_subnormals, bool & has_bf16_overflows) {
267+ if (prec == ov::element::f32 ) {
268+ uint32_t const * u32data = m_constOp->get_data_ptr <uint32_t >();
269+ float const * f32data = m_constOp->get_data_ptr <float >();
270+
271+ if (!size)
272+ return ;
273+
274+ const float bf16_max = 3 .3895313899137927e38f;
275+
276+ #if defined(OPENVINO_ARCH_X86_64)
277+ if (auto fn = jit_has_subnormals_function ()) {
278+ static const size_t batch_size = 2048 ;
279+ const size_t iterations_num = size / batch_size + 1 ;
280+
281+ volatile bool has_subnormals_local = false ;
282+
283+ parallel_for (iterations_num, [&](int n) {
284+ auto ptr = u32data + n * batch_size;
285+ const jit_has_subnormals_base::args_t args = {reinterpret_cast <float const *>(ptr),
286+ std::min (batch_size, (size_t )(u32data + size - ptr)),
287+ false };
288+
289+ fn (&args);
290+
291+ if (args.hasSubnormals )
292+ has_subnormals_local = true ;
293+ });
294+
295+ has_subnormals = has_subnormals_local;
296+ // TODO: opt with jit
297+ for (size_t i = 0 ; i < size; ++i) {
298+ if (!std::isnan (f32data[i]) && !std::isinf (f32data[i]) &&
299+ (f32data[i] < -bf16_max || f32data[i] > bf16_max)) {
300+ has_bf16_overflows = true ;
301+ return ;
302+ }
303+ }
304+ return ;
305+ }
306+ #endif
307+
308+ uint32_t mantissaMask = 0x007fffff ;
309+ uint32_t exponentMask = 0x7f800000 ;
310+ for (size_t i = 0 ; i < size; ++i) {
311+ if ((u32data[i] & exponentMask) == 0 && (u32data[i] & mantissaMask) != 0 ) {
312+ has_subnormals = true ;
313+ }
314+ if (!std::isnan (f32data[i]) && !std::isinf (f32data[i]) &&
315+ (f32data[i] < -bf16_max || f32data[i] > bf16_max)) {
316+ has_bf16_overflows = true ;
317+ }
318+ if (has_subnormals && has_bf16_overflows) {
319+ return ;
320+ }
321+ }
322+ }
323+ };
324+
325+ bool has_subnormals = false ;
326+ bool has_bf16_overflows = false ;
327+
328+ checkSubnormalsAndBF16Overflows (has_subnormals, has_bf16_overflows);
329+
265330 auto cloneBlob = [&, this ]() {
266331 MemoryPtr memory;
267332
@@ -294,7 +359,7 @@ void Input::cloneBlobIfRequired() {
294359 } else {
295360 ptr = std::make_shared<StaticMemory>(getEngine (), memDesc);
296361 }
297- ptr->load (*memory.get (), needFlushDenormalsToZero);
362+ ptr->load (*memory.get (), needFlushDenormalsToZero, has_bf16_overflows );
298363
299364 return ptr;
300365 };
@@ -311,60 +376,22 @@ void Input::cloneBlobIfRequired() {
311376#endif
312377 };
313378
314- // The presence of subnormals is better to determined at IR read time.
315- auto hasSubnormals = [&]() {
316- if (prec == ov::element::f32 ) {
317- uint32_t const * u32data = m_constOp->get_data_ptr <uint32_t >();
318-
319- if (!size)
320- return false ;
321-
322- #if defined(OPENVINO_ARCH_X86_64)
323- if (auto fn = jit_has_subnormals_function ()) {
324- static const size_t batch_size = 2048 ;
325- const size_t iterations_num = size / batch_size + 1 ;
326-
327- volatile bool has_subnormals = false ;
328-
329- parallel_for (iterations_num, [&](int n) {
330- auto ptr = u32data + n * batch_size;
331- const jit_has_subnormals_base::args_t args = {reinterpret_cast <float const *>(ptr),
332- std::min (batch_size, (size_t )(u32data + size - ptr)),
333- false };
334-
335- fn (&args);
336-
337- if (args.hasSubnormals )
338- has_subnormals = true ;
339- });
340-
341- return has_subnormals;
342- }
343- #endif
344-
345- uint32_t mantissaMask = 0x007fffff ;
346- uint32_t exponentMask = 0x7f800000 ;
347- for (size_t i = 0 ; i < size; ++i) {
348- if ((u32data[i] & exponentMask) == 0 && (u32data[i] & mantissaMask) != 0 ) {
349- return true ;
350- }
351- }
352- }
353- return false ;
354- };
355-
// Builds the string key for this constant: getName(), then size * prec.size()
// (presumably the payload size in bytes - TODO confirm), then the data pointer of
// m_constOp formatted with %p, joined by the separator literals below.
356379 auto blobKey = [&]() {
// Scratch buffer for the textual pointer; snprintf is bounded by sizeof ptr.
357380 char ptr[32 ];
358381 snprintf (ptr, sizeof ptr, " %p" , m_constOp->get_data_ptr ());
359382 return getName () + " _" + std::to_string (size * prec.size ()) + " _" + ptr;
360383 };
361384
385+ // my test
386+ if (has_bf16_overflows) {
387+ std::cout << " my test: has_bf16_overflows" << std::endl;
388+ }
362389 const auto weightCache = context->getWeightsCache ();
363390 const bool clone_is_not_needed =
364391 prec != element::string &&
365392 // IRs already have all subnormals flushed to zero, but in
366393 // read_model scenario with directly loaded original model still can have subnormals
367- isBlobAligned (m_constOp) && (!needFlushDenormalsToZero || !hasSubnormals ()) &&
394+ isBlobAligned (m_constOp) && (!needFlushDenormalsToZero || !has_subnormals) && !has_bf16_overflows &&
368395 // Blob should be cloned in cache only if original weights are stored on other numa node.
369396 // This is possible only in multistream case on multisocket machine.
370397 // TODO: don't clone blob for multisocket + multistream case if current stream is run on the numa node where
0 commit comments