diff --git a/CMakeLists.txt b/CMakeLists.txt index 459f11b3b3..a12cb60548 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -51,7 +51,7 @@ string(REGEX REPLACE ".*#define TBB_VERSION_MAJOR ([0-9]+).*" "\\1" _tbb_ver_maj string(REGEX REPLACE ".*#define TBB_VERSION_MINOR ([0-9]+).*" "\\1" _tbb_ver_minor "${_tbb_version_info}") string(REGEX REPLACE ".*#define TBB_INTERFACE_VERSION ([0-9]+).*" "\\1" TBB_INTERFACE_VERSION "${_tbb_version_info}") string(REGEX REPLACE ".*#define __TBB_BINARY_VERSION ([0-9]+).*" "\\1" TBB_BINARY_VERSION "${_tbb_version_info}") -set(TBB_BINARY_MINOR_VERSION 0) +set(TBB_BINARY_MINOR_VERSION 1) set(TBBMALLOC_BINARY_VERSION 2) set(TBBBIND_BINARY_VERSION 3) @@ -203,18 +203,18 @@ else() install(DIRECTORY include DESTINATION .) - install(EXPORT ${CMAKE_PROJECT_NAME}Targets + install(EXPORT ${PROJECT_NAME}Targets NAMESPACE TBB:: - DESTINATION lib/cmake/${CMAKE_PROJECT_NAME}) - file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_PROJECT_NAME}Config.cmake - "include(\${CMAKE_CURRENT_LIST_DIR}/${CMAKE_PROJECT_NAME}Targets.cmake)\n") + DESTINATION lib/cmake/${PROJECT_NAME}) + file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake + "include(\${CMAKE_CURRENT_LIST_DIR}/${PROJECT_NAME}Targets.cmake)\n") - write_basic_package_version_file("${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_PROJECT_NAME}ConfigVersion.cmake" + write_basic_package_version_file("${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake" COMPATIBILITY AnyNewerVersion) - install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_PROJECT_NAME}Config.cmake" - "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_PROJECT_NAME}ConfigVersion.cmake" - DESTINATION lib/cmake/${CMAKE_PROJECT_NAME}) + install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}Config.cmake" + "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake" + DESTINATION lib/cmake/${PROJECT_NAME}) # ------------------------------------------------------------------- endif() diff --git a/README.md b/README.md index d6c3470976..386a7070bf 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # oneAPI Threading Building Blocks (Beta) -[![Apache License Version 2.0](https://img.shields.io/badge/license-Apache_2.0-green.svg)](LICENSE) +[![Apache License Version 2.0](https://img.shields.io/badge/license-Apache_2.0-green.svg)](LICENSE.txt) oneAPI Threading Building Blocks (oneTBB) lets you easily write parallel C++ programs that take full advantage of multicore performance, that are portable, composable and have future-proof scalability. @@ -19,7 +19,7 @@ Please report issues and suggestions via ## How to Contribute To contribute to oneTBB, please open a GitHub pull request (preferred) or send us a patch by e-mail. -oneAPI Threading Building Blocks is licensed under [Apache License, Version 2.0](LICENSE). +oneAPI Threading Building Blocks is licensed under [Apache License, Version 2.0](LICENSE.txt). By its terms, contributions submitted to the project are also done under that license. 
## Engineering team contacts diff --git a/cmake/compilers/GNU.cmake b/cmake/compilers/GNU.cmake index f86e6e8084..9bff049f47 100644 --- a/cmake/compilers/GNU.cmake +++ b/cmake/compilers/GNU.cmake @@ -45,4 +45,4 @@ endif() # TBB malloc settings set(TBBMALLOC_LIB_COMPILE_FLAGS -fno-rtti -fno-exceptions) - +set(TBB_OPENMP_FLAG -fopenmp) diff --git a/cmake/compilers/Intel.cmake b/cmake/compilers/Intel.cmake index 9817b50eb0..871dc4b9d8 100644 --- a/cmake/compilers/Intel.cmake +++ b/cmake/compilers/Intel.cmake @@ -15,14 +15,17 @@ if (MSVC) include(${CMAKE_CURRENT_LIST_DIR}/MSVC.cmake) set(TBB_WARNING_LEVEL ${TBB_WARNING_LEVEL} /W3) + set(TBB_OPENMP_FLAG /Qopenmp) elseif (APPLE) include(${CMAKE_CURRENT_LIST_DIR}/AppleClang.cmake) set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} -fstack-protector -Wformat -Wformat-security $<$>:-fno-omit-frame-pointer -qno-opt-report-embed -D_FORTIFY_SOURCE=2>) + set(TBB_OPENMP_FLAG -qopenmp) else() include(${CMAKE_CURRENT_LIST_DIR}/GNU.cmake) set(TBB_LIB_LINK_FLAGS ${TBB_LIB_LINK_FLAGS} -static-intel -Wl,-z,relro,-z,now,) set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} -fstack-protector -Wformat -Wformat-security $<$>:-qno-opt-report-embed -D_FORTIFY_SOURCE=2> $<$:-falign-stack=maintain-16-byte>) + set(TBB_OPENMP_FLAG -qopenmp) endif() diff --git a/cmake/compilers/MSVC.cmake b/cmake/compilers/MSVC.cmake index a0962402d1..54a0fb8801 100644 --- a/cmake/compilers/MSVC.cmake +++ b/cmake/compilers/MSVC.cmake @@ -57,3 +57,5 @@ if (TBB_WINDOWS_DRIVER) set(TBB_COMMON_COMPILE_FLAGS ${TBB_COMMON_COMPILE_FLAGS} /D _UNICODE /DUNICODE /DWINAPI_FAMILY=WINAPI_FAMILY_APP /D__WRL_NO_DEFAULT_LIB__) endif() + +set(TBB_OPENMP_FLAG /openmp) diff --git a/cmake/packaging.cmake b/cmake/packaging.cmake index 52eca45247..10702fc065 100644 --- a/cmake/packaging.cmake +++ b/cmake/packaging.cmake @@ -14,7 +14,7 @@ # Note: current implementation uses CMAKE_BUILD_TYPE, # this parameter is not defined for multi-config generators. 
-set(CPACK_PACKAGE_NAME "${CMAKE_PROJECT_NAME}") +set(CPACK_PACKAGE_NAME "${PROJECT_NAME}") set(CPACK_PACKAGE_VERSION "${TBB_VERSION}") string(TOLOWER ${CPACK_PACKAGE_NAME}-${PROJECT_VERSION}-${CMAKE_SYSTEM_NAME}_${TBB_OUTPUT_DIR_BASE}_${CMAKE_BUILD_TYPE} CPACK_PACKAGE_FILE_NAME) set(CPACK_GENERATOR ZIP) diff --git a/cmake/vars_utils.cmake b/cmake/vars_utils.cmake index 252ea17e05..a7b2fbe97d 100644 --- a/cmake/vars_utils.cmake +++ b/cmake/vars_utils.cmake @@ -22,19 +22,19 @@ else() set(TBB_VARS_TEMPLATE "linux/env/vars.sh.in") endif() -get_filename_component(TBB_VARS_TEMPLATE_NAME ${CMAKE_SOURCE_DIR}/integration/${TBB_VARS_TEMPLATE} NAME) +get_filename_component(TBB_VARS_TEMPLATE_NAME ${PROJECT_SOURCE_DIR}/integration/${TBB_VARS_TEMPLATE} NAME) string(REPLACE ".in" "" TBB_VARS_NAME ${TBB_VARS_TEMPLATE_NAME}) macro(tbb_gen_vars target) add_custom_command(TARGET ${target} POST_BUILD COMMAND ${CMAKE_COMMAND} -DBINARY_DIR=${CMAKE_BINARY_DIR} - -DSOURCE_DIR=${CMAKE_SOURCE_DIR} + -DSOURCE_DIR=${PROJECT_SOURCE_DIR} -DBIN_PATH=$ -DVARS_TEMPLATE=${TBB_VARS_TEMPLATE} -DVARS_NAME=${TBB_VARS_NAME} -DTBB_INSTALL_VARS=${TBB_INSTALL_VARS} - -P ${CMAKE_SOURCE_DIR}/integration/cmake/generate_vars.cmake + -P ${PROJECT_SOURCE_DIR}/integration/cmake/generate_vars.cmake ) endmacro(tbb_gen_vars) diff --git a/doc/DoxygenLayout.xml b/doc/DoxygenLayout.xml index 30f060df81..cb906b0031 100644 --- a/doc/DoxygenLayout.xml +++ b/doc/DoxygenLayout.xml @@ -19,6 +19,7 @@ + diff --git a/include/tbb/concurrent_map.h b/include/tbb/concurrent_map.h index 40f73cb982..74d23db6ca 100644 --- a/include/tbb/concurrent_map.h +++ b/include/tbb/concurrent_map.h @@ -91,6 +91,7 @@ class concurrent_map : public concurrent_skip_list typename std::enable_if::value, diff --git a/include/tbb/concurrent_priority_queue.h b/include/tbb/concurrent_priority_queue.h index 77ef2b0c1d..b1d48256aa 100644 --- a/include/tbb/concurrent_priority_queue.h +++ b/include/tbb/concurrent_priority_queue.h @@ -234,8 +234,6 @@ class concurrent_priority_queue { }; cpq_operation( const value_type& value, operation_type t ) : type(t), elem(const_cast(&value)) {} - - cpq_operation( operation_type t ) : type(t) {} }; // class cpq_operation class functor { @@ -477,7 +475,7 @@ void swap( concurrent_priority_queue& lhs, inline namespace v1 { using detail::d1::concurrent_priority_queue; -}; // inline namespace v1 +} // inline namespace v1 } // namespace tbb #endif // __TBB_concurrent_priority_queue_H diff --git a/include/tbb/concurrent_set.h b/include/tbb/concurrent_set.h index e62cce936c..adf8bba4d1 100644 --- a/include/tbb/concurrent_set.h +++ b/include/tbb/concurrent_set.h @@ -74,6 +74,7 @@ class concurrent_set : public concurrent_skip_list void merge(concurrent_set& source) { @@ -148,6 +149,7 @@ class concurrent_multiset : public concurrent_skip_list void merge(concurrent_set& source) { diff --git a/include/tbb/concurrent_unordered_map.h b/include/tbb/concurrent_unordered_map.h index 309df0cc23..80fb662426 100644 --- a/include/tbb/concurrent_unordered_map.h +++ b/include/tbb/concurrent_unordered_map.h @@ -69,6 +69,7 @@ class concurrent_unordered_map // Include constructors of base type using base_type::base_type; + using base_type::operator=; // Observers mapped_type& operator[]( const key_type& key ) { @@ -220,7 +221,7 @@ class concurrent_unordered_multimap // Include constructors of base type using base_type::base_type; - + using base_type::operator=; using base_type::insert; template diff --git a/include/tbb/concurrent_unordered_set.h 
b/include/tbb/concurrent_unordered_set.h index 4c0b51215c..d7932a55b0 100644 --- a/include/tbb/concurrent_unordered_set.h +++ b/include/tbb/concurrent_unordered_set.h @@ -67,6 +67,7 @@ class concurrent_unordered_set // Include constructors of base_type; using base_type::base_type; + using base_type::operator=; template void merge( concurrent_unordered_set& source ) { @@ -162,6 +163,7 @@ class concurrent_unordered_multiset // Include constructors of base_type; using base_type::base_type; + using base_type::operator=; template void merge( concurrent_unordered_set& source ) { diff --git a/include/tbb/detail/_aggregator.h b/include/tbb/detail/_aggregator.h index 247be3107e..9b846915da 100644 --- a/include/tbb/detail/_aggregator.h +++ b/include/tbb/detail/_aggregator.h @@ -143,7 +143,6 @@ class aggregator : public aggregator_generic { HandlerType handle_operations; public: aggregator() = default; - explicit aggregator( HandlerType h ) : handle_operations(h) {} void initialize_handler( HandlerType h ) { handle_operations = h; } @@ -167,8 +166,8 @@ class aggregating_functor { }; // class aggregating_functor -}; // namespace d1 -}; // namespace detail -}; // namespace tbb +} // namespace d1 +} // namespace detail +} // namespace tbb #endif // __TBB_detail__aggregator_H diff --git a/include/tbb/detail/_concurrent_unordered_base.h b/include/tbb/detail/_concurrent_unordered_base.h index 94a8a78618..93911bd6fa 100644 --- a/include/tbb/detail/_concurrent_unordered_base.h +++ b/include/tbb/detail/_concurrent_unordered_base.h @@ -708,6 +708,8 @@ class concurrent_unordered_base { return my_midpoint_node != my_end_node; } + size_type grainsize() const { return 1; } + const_range_type( const_range_type& range, split ) : my_instance(range.my_instance), my_begin_node(range.my_midpoint_node), diff --git a/include/tbb/detail/_config.h b/include/tbb/detail/_config.h index 3e7489aef4..953732ed82 100644 --- a/include/tbb/detail/_config.h +++ b/include/tbb/detail/_config.h @@ -296,6 +296,8 @@ #define __TBB_TSX_INTRINSICS_PRESENT ((__RTM__ || _MSC_VER>=1700 || __INTEL_COMPILER) && !__ANDROID__) +#define __TBB_WAITPKG_INTRINSICS_PRESENT ((__INTEL_COMPILER >= 1900 || __TBB_GCC_VERSION >= 110000 || __TBB_CLANG_VERSION >= 120000) && !__ANDROID__) + /** Internal TBB features & modes **/ /** __TBB_SOURCE_DIRECTLY_INCLUDED is a mode used in whitebox testing when @@ -334,7 +336,7 @@ #define __TBB_ARENA_OBSERVER __TBB_SCHEDULER_OBSERVER #endif /* __TBB_ARENA_OBSERVER */ -#if TBB_PREVIEW_NUMA_SUPPORT || __TBB_BUILD +#ifndef __TBB_NUMA_SUPPORT #define __TBB_NUMA_SUPPORT 1 #endif diff --git a/include/tbb/detail/_flow_graph_body_impl.h b/include/tbb/detail/_flow_graph_body_impl.h index a66a42a683..eb4e59934c 100644 --- a/include/tbb/detail/_flow_graph_body_impl.h +++ b/include/tbb/detail/_flow_graph_body_impl.h @@ -218,7 +218,6 @@ class type_to_key_function_body_leaf : public type_to_key_function_body(body); } @@ -233,7 +232,6 @@ class type_to_key_function_body_leaf : public type_to_key_funct const Output& operator()(const Input &i) override { return body(i); } - B get_body() { return body; } type_to_key_function_body_leaf* clone() override { return new type_to_key_function_body_leaf< Input, Output&, B>(body); } @@ -313,12 +311,6 @@ class input_node_task_bypass : public graph_task { // ------------------------ end of node task bodies ----------------------------------- -//! 
An empty functor that takes an Input and returns a default constructed Output -template< typename Input, typename Output > -struct empty_body { - Output operator()( const Input & ) const { return Output(); } -}; - template class decrementer; @@ -342,15 +334,12 @@ class decrementer friend class limiter_node; - void reset_receiver( reset_flags ) override { - } + void reset_receiver( reset_flags ) {} public: - // Since decrementer does not make use of possibly unconstructed owner inside its - // constructor, my_node can be directly initialized with 'this' pointer passed from the - // owner, hence making method 'set_owner' needless. - decrementer() : my_node(NULL) {} - void set_owner( T *node ) { my_node = node; } + decrementer(T* owner) : my_node(owner) { + // Do not work with the passed pointer here as it may not be fully initialized yet + } }; template @@ -372,13 +361,11 @@ class decrementer : public continue_receiver, no_copy { typedef continue_msg input_type; typedef continue_msg output_type; - decrementer() : continue_receiver( /*number_of_predecessors=*/0, no_priority ) - // Since decrementer does not make use of possibly unconstructed owner inside its - // constructor, my_node can be directly initialized with 'this' pointer passed from the - // owner, hence making method 'set_owner' needless. - , my_node(NULL) - {} - void set_owner( T *node ) { my_node = node; } + decrementer(T* owner) + : continue_receiver( /*number_of_predecessors=*/0, no_priority ), my_node(owner) + { + // Do not work with the passed pointer here as it may not be fully initialized yet + } }; #endif // __TBB__flow_graph_body_impl_H diff --git a/include/tbb/detail/_flow_graph_cache_impl.h b/include/tbb/detail/_flow_graph_cache_impl.h index e93b16c69a..688ef944d8 100644 --- a/include/tbb/detail/_flow_graph_cache_impl.h +++ b/include/tbb/detail/_flow_graph_cache_impl.h @@ -44,7 +44,8 @@ class node_cache { typename mutex_type::scoped_lock lock( my_mutex ); for ( size_t i = internal_size(); i != 0; --i ) { T &s = internal_pop(); - if ( &s == &n ) return; // only remove one predecessor per request + if ( &s == &n ) + break; // only remove one predecessor per request internal_push(s); } } @@ -92,11 +93,12 @@ class predecessor_cache : public node_cache< sender, M > { typedef sender predecessor_type; typedef receiver successor_type; - predecessor_cache( ) : my_owner( NULL ) { } - - void set_owner( successor_type *owner ) { my_owner = owner; } + predecessor_cache( successor_type* owner ) : my_owner( owner ) { + __TBB_ASSERT( my_owner, "predecessor_cache should have an owner." ); + // Do not work with the passed pointer here as it may not be fully initialized yet + } - bool get_item( output_type &v ) { + bool get_item( output_type& v ) { bool msg = false; @@ -115,8 +117,7 @@ class predecessor_cache : public node_cache< sender, M > { if (msg == false) { // Relinquish ownership of the edge - if (my_owner) - register_successor(*src, *my_owner); + register_successor(*src, *my_owner); } else { // Retain ownership of the edge this->add(*src); @@ -127,20 +128,18 @@ class predecessor_cache : public node_cache< sender, M > { // If we are removing arcs (rf_clear_edges), call clear() rather than reset(). 
void reset() { - if (my_owner) { - for(;;) { - predecessor_type *src; - { - if (this->internal_empty()) break; - src = &this->internal_pop(); - } - register_successor(*src, *my_owner); + for(;;) { + predecessor_type *src; + { + if (this->internal_empty()) break; + src = &this->internal_pop(); } + register_successor(*src, *my_owner); } } protected: - successor_type *my_owner; + successor_type* my_owner; }; //! A cache of predecessors that supports requests and reservations template class reservable_predecessor_cache : public predecessor_cache< T, M > { public: typedef sender predecessor_type; typedef receiver successor_type; - reservable_predecessor_cache( ) : reserved_src(NULL) { } + reservable_predecessor_cache( successor_type* owner ) + : predecessor_cache(owner), reserved_src(NULL) + { + // Do not work with the passed pointer here as it may not be fully initialized yet + } bool try_reserve( output_type &v ) { @@ -222,22 +225,22 @@ class successor_cache : no_copy { mutex_type my_mutex; typedef receiver successor_type; - typedef receiver *pointer_type; + typedef receiver* pointer_type; typedef sender owner_type; // TODO revamp: introduce heapified collection of successors for strict priorities typedef std::list< pointer_type > successors_type; successors_type my_successors; - owner_type *my_owner; + owner_type* my_owner; public: - successor_cache( ) : my_owner(NULL) {} - - void set_owner( owner_type *owner ) { my_owner = owner; } + successor_cache( owner_type* owner ) : my_owner(owner) { + // Do not work with the passed pointer here as it may not be fully initialized yet + } virtual ~successor_cache() {} - void register_successor( successor_type &r ) { + void register_successor( successor_type& r ) { typename mutex_type::scoped_lock l(my_mutex, true); if( r.priority() != no_priority ) my_successors.push_front( &r ); @@ -245,7 +248,7 @@ class successor_cache : no_copy { my_successors.push_back( &r ); } - void remove_successor( successor_type &r ) { + void remove_successor( successor_type& r ) { typename mutex_type::scoped_lock l(my_mutex, true); for ( typename successors_type::iterator i = my_successors.begin(); i != my_successors.end(); ++i ) { @@ -265,8 +268,8 @@ class successor_cache : no_copy { my_successors.clear(); } - virtual graph_task* try_put_task( const T &t ) = 0; - }; // successor_cache + virtual graph_task* try_put_task( const T& t ) = 0; +}; // successor_cache //! An abstract cache of successors, specialized to continue_msg template class successor_cache< continue_msg, M > : no_copy { mutex_type my_mutex; typedef receiver successor_type; - typedef receiver *pointer_type; + typedef receiver* pointer_type; + typedef sender owner_type; typedef std::list< pointer_type > successors_type; successors_type my_successors; - sender *my_owner; + owner_type* my_owner; public: - successor_cache( ) : my_owner(NULL) {} - - void set_owner( sender *owner ) { my_owner = owner; } + successor_cache( sender* owner ) : my_owner(owner) { + // Do not work with the passed pointer here as it may not be fully initialized yet + } virtual ~successor_cache() {} - void register_successor( successor_type &r ) { + void register_successor( successor_type& r ) { typename mutex_type::scoped_lock l(my_mutex, true); if( r.priority() != no_priority ) my_successors.push_front( &r ); else my_successors.push_back( &r ); - if ( my_owner && r.is_continue_receiver() ) { + __TBB_ASSERT( my_owner, "Cache of successors must have an owner." 
); + if ( r.is_continue_receiver() ) { r.register_predecessor( *my_owner ); } } - void remove_successor( successor_type &r ) { + void remove_successor( successor_type& r ) { typename mutex_type::scoped_lock l(my_mutex, true); - for ( successors_type::iterator i = my_successors.begin(); - i != my_successors.end(); ++i ) { - if ( *i == & r ) { - // TODO: Check if we need to test for continue_receiver before - // removing from r. - if ( my_owner ) - r.remove_predecessor( *my_owner ); + for ( successors_type::iterator i = my_successors.begin(); i != my_successors.end(); ++i ) { + if ( *i == &r ) { + __TBB_ASSERT(my_owner, "Cache of successors must have an owner."); + // TODO: check if we need to test for continue_receiver before removing from r. + r.remove_predecessor( *my_owner ); my_successors.erase(i); break; } @@ -324,24 +327,26 @@ class successor_cache< continue_msg, M > : no_copy { my_successors.clear(); } - virtual graph_task* try_put_task( const continue_msg &t ) = 0; + virtual graph_task* try_put_task( const continue_msg& t ) = 0; }; // successor_cache< continue_msg > //! A cache of successors that are broadcast to template class broadcast_cache : public successor_cache { + typedef successor_cache base_type; typedef M mutex_type; typedef typename successor_cache::successors_type successors_type; public: - broadcast_cache( ) {} + broadcast_cache( typename base_type::owner_type* owner ): base_type(owner) { + // Do not work with the passed pointer here as it may not be fully initialized yet + } // as above, but call try_put_task instead, and return the last task we received (if any) graph_task* try_put_task( const T &t ) override { graph_task * last_task = nullptr; - bool upgraded = true; - typename mutex_type::scoped_lock l(this->my_mutex, upgraded); + typename mutex_type::scoped_lock l(this->my_mutex, /*write=*/true); typename successors_type::iterator i = this->my_successors.begin(); while ( i != this->my_successors.end() ) { graph_task *new_task = (*i)->try_put_task(t); @@ -353,10 +358,6 @@ class broadcast_cache : public successor_cache { } else { // failed if ( (*i)->register_predecessor(*this->my_owner) ) { - if (!upgraded) { - l.upgrade_to_writer(); - upgraded = true; - } i = this->my_successors.erase(i); } else { ++i; @@ -368,9 +369,8 @@ class broadcast_cache : public successor_cache { // call try_put_task and return list of received tasks bool gather_successful_try_puts( const T &t, graph_task_list& tasks ) { - bool upgraded = true; bool is_at_least_one_put_successful = false; - typename mutex_type::scoped_lock l(this->my_mutex, upgraded); + typename mutex_type::scoped_lock l(this->my_mutex, /*write=*/true); typename successors_type::iterator i = this->my_successors.begin(); while ( i != this->my_successors.end() ) { graph_task * new_task = (*i)->try_put_task(t); @@ -383,10 +383,6 @@ class broadcast_cache : public successor_cache { } else { // failed if ( (*i)->register_predecessor(*this->my_owner) ) { - if (!upgraded) { - l.upgrade_to_writer(); - upgraded = true; - } i = this->my_successors.erase(i); } else { ++i; @@ -400,13 +396,16 @@ class broadcast_cache : public successor_cache { //! 
A cache of successors that are put in a round-robin fashion template class round_robin_cache : public successor_cache { + typedef successor_cache base_type; typedef size_t size_type; typedef M mutex_type; typedef typename successor_cache::successors_type successors_type; public: - round_robin_cache( ) {} + round_robin_cache( typename base_type::owner_type* owner ): base_type(owner) { + // Do not work with the passed pointer here as it may not be fully initialized yet + } size_type size() { typename mutex_type::scoped_lock l(this->my_mutex, false); @@ -414,8 +413,7 @@ class round_robin_cache : public successor_cache { } graph_task* try_put_task( const T &t ) override { - bool upgraded = true; - typename mutex_type::scoped_lock l(this->my_mutex, upgraded); + typename mutex_type::scoped_lock l(this->my_mutex, /*write=*/true); typename successors_type::iterator i = this->my_successors.begin(); while ( i != this->my_successors.end() ) { graph_task* new_task = (*i)->try_put_task(t); @@ -423,10 +421,6 @@ class round_robin_cache : public successor_cache { return new_task; } else { if ( (*i)->register_predecessor(*this->my_owner) ) { - if (!upgraded) { - l.upgrade_to_writer(); - upgraded = true; - } i = this->my_successors.erase(i); } else { diff --git a/include/tbb/detail/_flow_graph_impl.h b/include/tbb/detail/_flow_graph_impl.h index b11c7a02de..e489494f3c 100644 --- a/include/tbb/detail/_flow_graph_impl.h +++ b/include/tbb/detail/_flow_graph_impl.h @@ -52,9 +52,6 @@ class graph_iterator { typedef const GraphNodeType& const_reference; typedef std::forward_iterator_tag iterator_category; - //! Default constructor - graph_iterator() : my_graph(NULL), current_node(NULL) {} - //! Copy constructor graph_iterator(const graph_iterator& other) : my_graph(other.my_graph), current_node(other.current_node) @@ -137,9 +134,9 @@ class graph_task : public task { // TODO revamp: rename to my_priority node_priority_t priority; void destruct_and_deallocate(const execution_data& ed); + task* cancel(execution_data& ed) override; protected: void finalize(const execution_data& ed); - task* cancel(execution_data& ed) override; private: // To organize task_list graph_task* my_next{ nullptr }; @@ -160,29 +157,38 @@ typedef tbb::concurrent_priority_queue graph class priority_task_selector : public task { public: priority_task_selector(graph_task_priority_queue_t& priority_queue, small_object_allocator& allocator) - : my_priority_queue(priority_queue), my_allocator(allocator) {} + : my_priority_queue(priority_queue), my_allocator(allocator), my_task() {} task* execute(execution_data& ed) override { - graph_task* t = nullptr; - // TODO revamp: hold functors in priority queue instead of real tasks - bool result = my_priority_queue.try_pop(t); - __TBB_ASSERT_EX( result, "Number of critical tasks for scheduler and tasks" - " in graph's priority queue mismatched" ); - __TBB_ASSERT( t && t != SUCCESSFULLY_ENQUEUED, - "Incorrect task submitted to graph priority queue" ); - __TBB_ASSERT( t->priority != no_priority, - "Tasks from graph's priority queue must have priority" ); - // TODO revamp: consider cancellation and exception handling for the task t. 
- task* t_next = t->execute(ed); + next_task(); + __TBB_ASSERT(my_task, nullptr); + task* t_next = my_task->execute(ed); my_allocator.delete_object(this, ed); return t_next; } task* cancel(execution_data& ed) override { + if (!my_task) { + next_task(); + } + __TBB_ASSERT(my_task, nullptr); + task* t_next = my_task->cancel(ed); my_allocator.delete_object(this, ed); - return nullptr; + return t_next; } private: + void next_task() { + // TODO revamp: hold functors in priority queue instead of real tasks + bool result = my_priority_queue.try_pop(my_task); + __TBB_ASSERT_EX(result, "Number of critical tasks for scheduler and tasks" + " in graph's priority queue mismatched"); + __TBB_ASSERT(my_task && my_task != SUCCESSFULLY_ENQUEUED, + "Incorrect task submitted to graph priority queue"); + __TBB_ASSERT(my_task->priority != no_priority, + "Tasks from graph's priority queue must have priority"); + } + graph_task_priority_queue_t& my_priority_queue; small_object_allocator my_allocator; + graph_task* my_task; }; template class run_and_put_task; diff --git a/include/tbb/detail/_flow_graph_indexer_impl.h b/include/tbb/detail/_flow_graph_indexer_impl.h index 33ef530265..c4d33328b5 100644 --- a/include/tbb/detail/_flow_graph_indexer_impl.h +++ b/include/tbb/detail/_flow_graph_indexer_impl.h @@ -45,11 +45,6 @@ std::get(my_input).set_up(p, indexer_node_put_task, g); indexer_helper::template set_indexer_node_pointer(my_input, p, g); } - template - static inline void reset_inputs(InputTuple &my_input, reset_flags f) { - indexer_helper::reset_inputs(my_input, f); - std::get(my_input).reset_receiver(f); - } }; template @@ -60,10 +55,6 @@ graph_task* (*indexer_node_put_task)(const T&, void *) = do_try_put; std::get<0>(my_input).set_up(p, indexer_node_put_task, g); } - template - static inline void reset_inputs(InputTuple &my_input, reset_flags f) { - std::get<0>(my_input).reset_receiver(f); - } }; template @@ -91,9 +82,6 @@ graph& graph_reference() const override { return *my_graph; } - - public: - void reset_receiver(reset_flags /*f*/) override { } }; template @@ -142,7 +130,6 @@ type(char(t)), my_arg(e) {} indexer_node_base_operation(const successor_type &s, op_type t) : type(char(t)), my_succ(const_cast(&s)) {} - indexer_node_base_operation(op_type t) : type(char(t)) {} }; typedef aggregating_functor handler_type; @@ -175,15 +162,15 @@ } // ---------- end aggregator ----------- public: - indexer_node_base(graph& g) : graph_node(g), input_ports_type() { + indexer_node_base(graph& g) : graph_node(g), input_ports_type(), my_successors(this) { indexer_helper::set_indexer_node_pointer(this->my_inputs, this, g); - my_successors.set_owner(this); my_aggregator.initialize_handler(handler_type(this)); } - indexer_node_base(const indexer_node_base& other) : graph_node(other.my_graph), input_ports_type(), sender() { + indexer_node_base(const indexer_node_base& other) + : graph_node(other.my_graph), input_ports_type(), sender(), my_successors(this) + { indexer_helper::set_indexer_node_pointer(this->my_inputs, this, other.my_graph); - my_successors.set_owner(this); my_aggregator.initialize_handler(handler_type(this)); } @@ -209,7 +196,6 @@ void reset_node(reset_flags f) override { if(f & rf_clear_edges) { my_successors.clear(); - indexer_helper::reset_inputs(this->my_inputs,f); } } diff --git a/include/tbb/detail/_flow_graph_item_buffer_impl.h b/include/tbb/detail/_flow_graph_item_buffer_impl.h index 15d63c07e1..47186a9df8 100644 --- a/include/tbb/detail/_flow_graph_item_buffer_impl.h +++ 
b/include/tbb/detail/_flow_graph_item_buffer_impl.h @@ -62,7 +62,9 @@ class item_buffer { } bool my_item_valid(size_type i) const { return (i < my_tail) && (i >= my_head) && (item(i).second != no_item); } +#if TBB_USE_ASSERT bool my_item_reserved(size_type i) const { return item(i).second == reserved_item; } +#endif // object management in buffer const item_type &get_my_item(size_t i) const { diff --git a/include/tbb/detail/_flow_graph_join_impl.h b/include/tbb/detail/_flow_graph_join_impl.h index e0763acc37..cf889d785c 100644 --- a/include/tbb/detail/_flow_graph_join_impl.h +++ b/include/tbb/detail/_flow_graph_join_impl.h @@ -26,12 +26,22 @@ struct forwarding_base : no_assign { forwarding_base(graph &g) : graph_ref(g) {} virtual ~forwarding_base() {} + graph& graph_ref; + }; + + struct queueing_forwarding_base : forwarding_base { + using forwarding_base::forwarding_base; // decrement_port_count may create a forwarding task. If we cannot handle the task // ourselves, ask decrement_port_count to deal with it. virtual graph_task* decrement_port_count(bool handle_task) = 0; + }; + + struct reserving_forwarding_base : forwarding_base { + using forwarding_base::forwarding_base; + // decrement_port_count may create a forwarding task. If we cannot handle the task + // ourselves, ask decrement_port_count to deal with it. + virtual graph_task* decrement_port_count() = 0; virtual void increment_port_count() = 0; - // moved here so input ports can queue tasks - graph& graph_ref; }; // specialization that lets us keep a copy of the current_key for building results. @@ -40,7 +50,7 @@ struct matching_forwarding_base : public forwarding_base { typedef typename std::decay::type current_key_type; matching_forwarding_base(graph &g) : forwarding_base(g) { } - virtual graph_task* increment_key_count(current_key_type const & /*t*/, bool /*handle_task*/) = 0; // {return NULL;} + virtual graph_task* increment_key_count(current_key_type const & /*t*/) = 0; current_key_type current_key; // so ports can refer to FE's desired items }; @@ -104,15 +114,17 @@ template static inline void set_key_functors(InputTuple &my_input, KeyFuncTuple &my_key_funcs) { std::get(my_input).set_my_key_func(std::get(my_key_funcs)); - std::get(my_key_funcs) = NULL; + std::get(my_key_funcs) = nullptr; join_helper::set_key_functors(my_input, my_key_funcs); } template< typename KeyFuncTuple> static inline void copy_key_functors(KeyFuncTuple &my_inputs, KeyFuncTuple &other_inputs) { - if(std::get(other_inputs).get_my_key_func()) { - std::get(my_inputs).set_my_key_func(std::get(other_inputs).get_my_key_func()->clone()); - } + __TBB_ASSERT( + std::get(other_inputs).get_my_key_func(), + "key matching join node should not be instantiated without functors." 
+ ); + std::get(my_inputs).set_my_key_func(std::get(other_inputs).get_my_key_func()->clone()); join_helper::copy_key_functors(my_inputs, other_inputs); } @@ -174,14 +186,16 @@ template static inline void set_key_functors(InputTuple &my_input, KeyFuncTuple &my_key_funcs) { std::get<0>(my_input).set_my_key_func(std::get<0>(my_key_funcs)); - std::get<0>(my_key_funcs) = NULL; + std::get<0>(my_key_funcs) = nullptr; } template< typename KeyFuncTuple> static inline void copy_key_functors(KeyFuncTuple &my_inputs, KeyFuncTuple &other_inputs) { - if(std::get<0>(other_inputs).get_my_key_func()) { - std::get<0>(my_inputs).set_my_key_func(std::get<0>(other_inputs).get_my_key_func()->clone()); - } + __TBB_ASSERT( + std::get<0>(other_inputs).get_my_key_func(), + "key matching join node should not be instantiated without functors." + ); + std::get<0>(my_inputs).set_my_key_func(std::get<0>(other_inputs).get_my_key_func()->clone()); } template static inline void reset_inputs(InputTuple &my_input, reset_flags f) { @@ -222,23 +236,27 @@ void handle_operations(reserving_port_operation* op_list) { reserving_port_operation *current; - bool no_predecessors; + bool was_missing_predecessors = false; while(op_list) { current = op_list; op_list = op_list->next; switch(current->type) { case reg_pred: - no_predecessors = my_predecessors.empty(); + was_missing_predecessors = my_predecessors.empty(); my_predecessors.add(*(current->my_pred)); - if ( no_predecessors ) { - (void) my_join->decrement_port_count(true); // may try to forward + if ( was_missing_predecessors ) { + (void) my_join->decrement_port_count(); // may try to forward } current->status.store( SUCCEEDED, std::memory_order_release); break; case rem_pred: - my_predecessors.remove(*(current->my_pred)); - if(my_predecessors.empty()) my_join->increment_port_count(); - current->status.store( SUCCEEDED, std::memory_order_release); + if ( !my_predecessors.empty() ) { + my_predecessors.remove(*(current->my_pred)); + if ( my_predecessors.empty() ) // was the last predecessor + my_join->increment_port_count(); + } + // TODO: consider returning failure if there were no predecessors to remove + current->status.store( SUCCEEDED, std::memory_order_release ); break; case res_item: if ( reserved ) { @@ -273,7 +291,7 @@ template friend class broadcast_cache; template friend class round_robin_cache; graph_task* try_put_task( const T & ) override { - return NULL; + return nullptr; } graph& graph_reference() const override { @@ -283,21 +301,14 @@ public: //! 
Constructor - reserving_port() : reserved(false) { - my_join = NULL; - my_predecessors.set_owner( this ); + reserving_port() : my_join(nullptr), my_predecessors(this), reserved(false) { my_aggregator.initialize_handler(handler_type(this)); } // copy constructor - reserving_port(const reserving_port& /* other */) : receiver() { - reserved = false; - my_join = NULL; - my_predecessors.set_owner( this ); - my_aggregator.initialize_handler(handler_type(this)); - } + reserving_port(const reserving_port& /* other */) = delete; - void set_join_node_pointer(forwarding_base *join) { + void set_join_node_pointer(reserving_forwarding_base *join) { my_join = join; } @@ -334,7 +345,7 @@ my_aggregator.execute(&op_data); } - void reset_receiver( reset_flags f) override { + void reset_receiver( reset_flags f) { if(f & rf_clear_edges) my_predecessors.clear(); else my_predecessors.reset(); @@ -347,7 +358,7 @@ friend class get_graph_helper; #endif - forwarding_base *my_join; + reserving_forwarding_base *my_join; reservable_predecessor_cache< T, null_mutex > my_predecessors; bool reserved; }; // reserving_port @@ -374,16 +385,16 @@ // constructor for value parameter queueing_port_operation(const T& e, op_type t) : type(char(t)), my_val(e) - , bypass_t(NULL) + , bypass_t(nullptr) {} // constructor for pointer parameter queueing_port_operation(const T* p, op_type t) : type(char(t)), my_arg(const_cast(p)) - , bypass_t(NULL) + , bypass_t(nullptr) {} // constructor with no parameter queueing_port_operation(op_type t) : type(char(t)) - , bypass_t(NULL) + , bypass_t(nullptr) {} }; @@ -399,7 +410,7 @@ op_list = op_list->next; switch(current->type) { case try__put_task: { - graph_task* rtask = NULL; + graph_task* rtask = nullptr; was_empty = this->buffer_empty(); this->push_back(current->my_val); if (was_empty) rtask = my_join->decrement_port_count(false); @@ -451,18 +462,15 @@ //! Constructor queueing_port() : item_buffer() { - my_join = NULL; + my_join = nullptr; my_aggregator.initialize_handler(handler_type(this)); } //! copy constructor - queueing_port(const queueing_port& /* other */) : receiver(), item_buffer() { - my_join = NULL; - my_aggregator.initialize_handler(handler_type(this)); - } + queueing_port(const queueing_port& /* other */) = delete; //! 
record parent for tallying available items - void set_join_node_pointer(forwarding_base *join) { + void set_join_node_pointer(queueing_forwarding_base *join) { my_join = join; } @@ -480,7 +488,7 @@ return; } - void reset_receiver(reset_flags) override { + void reset_receiver(reset_flags) { item_buffer::reset(); } @@ -489,7 +497,7 @@ friend class get_graph_helper; #endif - forwarding_base *my_join; + queueing_forwarding_base *my_join; }; // queueing_port #include "_flow_graph_tagged_buffer_impl.h" @@ -585,10 +593,10 @@ template friend class round_robin_cache; graph_task* try_put_task(const input_type& v) override { key_matching_port_operation op_data(v, try__put); - graph_task* rtask = NULL; + graph_task* rtask = nullptr; my_aggregator.execute(&op_data); if(op_data.status == SUCCEEDED) { - rtask = my_join->increment_key_count((*(this->get_key_func()))(v), false); // may spawn + rtask = my_join->increment_key_count((*(this->get_key_func()))(v)); // may spawn // rtask has to reflect the return status of the try_put if(!rtask) rtask = SUCCESSFULLY_ENQUEUED; } @@ -602,16 +610,17 @@ public: key_matching_port() : receiver(), buffer_type() { - my_join = NULL; + my_join = nullptr; my_aggregator.initialize_handler(handler_type(this)); } // copy constructor - key_matching_port(const key_matching_port& /*other*/) : receiver(), buffer_type() { - my_join = NULL; - my_aggregator.initialize_handler(handler_type(this)); - } - + key_matching_port(const key_matching_port& /*other*/) = delete; +#if __INTEL_COMPILER <= 2021 + // Suppress superfluous diagnostic about virtual keyword absence in a destructor of an inherited + // class while the parent class has the virtual keyword for the destructor. + virtual +#endif ~key_matching_port() { } void set_join_node_pointer(forwarding_base *join) { @@ -637,7 +646,7 @@ return; } - void reset_receiver(reset_flags ) override { + void reset_receiver(reset_flags ) { buffer_type::reset(); } @@ -657,19 +666,19 @@ class join_node_FE; template - class join_node_FE : public forwarding_base { + class join_node_FE : public reserving_forwarding_base { public: static const int N = std::tuple_size::value; typedef OutputTuple output_type; typedef InputTuple input_type; typedef join_node_base base_node_type; // for forwarding - join_node_FE(graph &g) : forwarding_base(g), my_node(NULL) { + join_node_FE(graph &g) : reserving_forwarding_base(g), my_node(nullptr) { ports_with_no_inputs = N; join_helper::set_join_node_pointer(my_inputs, this); } - join_node_FE(const join_node_FE& other) : forwarding_base((other.forwarding_base::graph_ref)), my_node(NULL) { + join_node_FE(const join_node_FE& other) : reserving_forwarding_base((other.reserving_forwarding_base::graph_ref)), my_node(nullptr) { ports_with_no_inputs = N; join_helper::set_join_node_pointer(my_inputs, this); } @@ -681,19 +690,17 @@ } // if all input_ports have predecessors, spawn forward to try and consume tuples - graph_task* decrement_port_count(bool handle_task) override { + graph_task* decrement_port_count() override { if(ports_with_no_inputs.fetch_sub(1) == 1) { if(is_graph_active(this->graph_ref)) { small_object_allocator allocator{}; typedef forward_task_bypass task_type; graph_task* t = allocator.new_object(graph_ref, allocator, *my_node); graph_ref.reserve_wait(); - if( !handle_task ) - return t; spawn_in_graph_arena(this->graph_ref, *t); } } - return NULL; + return nullptr; } input_type &input_ports() { return my_inputs; } @@ -730,19 +737,19 @@ }; // join_node_FE template - class join_node_FE : public forwarding_base { 
+ class join_node_FE : public queueing_forwarding_base { public: static const int N = std::tuple_size::value; typedef OutputTuple output_type; typedef InputTuple input_type; typedef join_node_base base_node_type; // for forwarding - join_node_FE(graph &g) : forwarding_base(g), my_node(NULL) { + join_node_FE(graph &g) : queueing_forwarding_base(g), my_node(nullptr) { ports_with_no_items = N; join_helper::set_join_node_pointer(my_inputs, this); } - join_node_FE(const join_node_FE& other) : forwarding_base((other.forwarding_base::graph_ref)), my_node(NULL) { + join_node_FE(const join_node_FE& other) : queueing_forwarding_base((other.queueing_forwarding_base::graph_ref)), my_node(nullptr) { ports_with_no_items = N; join_helper::set_join_node_pointer(my_inputs, this); } @@ -771,8 +778,6 @@ return nullptr; } - void increment_port_count() override { __TBB_ASSERT(false, NULL); } // should never be called - input_type &input_ports() { return my_inputs; } protected: @@ -853,14 +858,12 @@ unref_key_type my_val; output_type* my_output; graph_task* bypass_t; - bool enqueue_task; // constructor for value parameter - key_matching_FE_operation(const unref_key_type& e , bool q_task , op_type t) : type(char(t)), my_val(e), - my_output(NULL), bypass_t(NULL), enqueue_task(q_task) {} - key_matching_FE_operation(output_type *p, op_type t) : type(char(t)), my_output(p), bypass_t(NULL), - enqueue_task(true) {} + key_matching_FE_operation(const unref_key_type& e , op_type t) : type(char(t)), my_val(e), + my_output(nullptr), bypass_t(nullptr) {} + key_matching_FE_operation(output_type *p, op_type t) : type(char(t)), my_output(p), bypass_t(nullptr) {} // constructor with no parameter - key_matching_FE_operation(op_type t) : type(char(t)), my_output(NULL), bypass_t(NULL), enqueue_task(true) {} + key_matching_FE_operation(op_type t) : type(char(t)), my_output(nullptr), bypass_t(nullptr) {} }; typedef aggregating_functor handler_type; @@ -869,11 +872,11 @@ // called from aggregator, so serialized // returns a task pointer if the a task would have been enqueued but we asked that - // it be returned. Otherwise returns NULL. - graph_task* fill_output_buffer(unref_key_type &t, bool should_enqueue, bool handle_task) { + // it be returned. Otherwise returns nullptr. 
+ graph_task* fill_output_buffer(unref_key_type &t) { output_type l_out; - graph_task* rtask = NULL; - bool do_fwd = should_enqueue && this->buffer_empty() && is_graph_active(this->graph_ref); + graph_task* rtask = nullptr; + bool do_fwd = this->buffer_empty() && is_graph_active(this->graph_ref); this->current_key = t; this->delete_with_key(this->current_key); // remove the key if(join_helper::get_items(my_inputs, l_out)) { // <== call back @@ -883,12 +886,6 @@ typedef forward_task_bypass task_type; rtask = allocator.new_object(this->graph_ref, allocator, *my_node); this->graph_ref.reserve_wait(); - if( handle_task ) { - // TODO revamp: make spawn_in_graph_arena extract reference to the graph - // from the passed task - spawn_in_graph_arena(this->graph_ref, *rtask); - rtask = NULL; - } do_fwd = false; } // retire the input values @@ -915,20 +912,16 @@ case inc_count: { // called from input ports count_element_type *p = 0; unref_key_type &t = current->my_val; - bool do_enqueue = current->enqueue_task; if(!(this->find_ref_with_key(t,p))) { count_element_type ev; ev.my_key = t; ev.my_value = 0; this->insert_with_key(ev); - if(!(this->find_ref_with_key(t,p))) { - __TBB_ASSERT(false, "should find key after inserting it"); - } + bool found = this->find_ref_with_key(t, p); + __TBB_ASSERT_EX(found, "should find key after inserting it"); } if(++(p->my_value) == size_t(N)) { - graph_task* rtask = fill_output_buffer(t, true, do_enqueue); - __TBB_ASSERT(!rtask || !do_enqueue, "task should not be returned"); - current->bypass_t = rtask; + current->bypass_t = fill_output_buffer(t); } } current->status.store( SUCCEEDED, std::memory_order_release); @@ -952,7 +945,7 @@ public: template - join_node_FE(graph &g, FunctionTuple &TtoK_funcs) : forwarding_base_type(g), my_node(NULL) { + join_node_FE(graph &g, FunctionTuple &TtoK_funcs) : forwarding_base_type(g), my_node(nullptr) { join_helper::set_join_node_pointer(my_inputs, this); join_helper::set_key_functors(my_inputs, TtoK_funcs); my_aggregator.initialize_handler(handler_type(this)); @@ -962,7 +955,7 @@ join_node_FE(const join_node_FE& other) : forwarding_base_type((other.forwarding_base_type::graph_ref)), key_to_count_buffer_type(), output_buffer_type() { - my_node = NULL; + my_node = nullptr; join_helper::set_join_node_pointer(my_inputs, this); join_helper::copy_key_functors(my_inputs, const_cast(other.my_inputs)); my_aggregator.initialize_handler(handler_type(this)); @@ -981,16 +974,12 @@ // if all input_ports have items, spawn forward to try and consume tuples // return a task if we are asked and did create one. 
- graph_task *increment_key_count(unref_key_type const & t, bool handle_task) override { // called from input_ports - key_matching_FE_operation op_data(t, handle_task, inc_count); + graph_task *increment_key_count(unref_key_type const & t) override { // called from input_ports + key_matching_FE_operation op_data(t, inc_count); my_aggregator.execute(&op_data); return op_data.bypass_t; } - graph_task *decrement_port_count(bool /*handle_task*/) override { __TBB_ASSERT(false, NULL); return NULL; } - - void increment_port_count() override { __TBB_ASSERT(false, NULL); } // should never be called - input_type &input_ports() { return my_inputs; } protected: @@ -1062,10 +1051,10 @@ }; graph_task* bypass_t; join_node_base_operation(const output_type& e, op_type t) : type(char(t)), - my_arg(const_cast(&e)), bypass_t(NULL) {} + my_arg(const_cast(&e)), bypass_t(nullptr) {} join_node_base_operation(const successor_type &s, op_type t) : type(char(t)), - my_succ(const_cast(&s)), bypass_t(NULL) {} - join_node_base_operation(op_type t) : type(char(t)), bypass_t(NULL) {} + my_succ(const_cast(&s)), bypass_t(nullptr) {} + join_node_base_operation(op_type t) : type(char(t)), bypass_t(nullptr) {} }; typedef aggregating_functor handler_type; @@ -1108,7 +1097,7 @@ break; case do_fwrd_bypass: { bool build_succeeded; - graph_task *last_task = NULL; + graph_task *last_task = nullptr; output_type out; // forwarding must be exclusive, because try_to_make_tuple and tuple_accepted // are separate locked methods in the FE. We could conceivably fetch the front @@ -1143,23 +1132,25 @@ } // ---------- end aggregator ----------- public: - join_node_base(graph &g) : graph_node(g), input_ports_type(g), forwarder_busy(false) { - my_successors.set_owner(this); + join_node_base(graph &g) + : graph_node(g), input_ports_type(g), forwarder_busy(false), my_successors(this) + { input_ports_type::set_my_node(this); my_aggregator.initialize_handler(handler_type(this)); } join_node_base(const join_node_base& other) : graph_node(other.graph_node::my_graph), input_ports_type(other), - sender(), forwarder_busy(false), my_successors() { - my_successors.set_owner(this); + sender(), forwarder_busy(false), my_successors(this) + { input_ports_type::set_my_node(this); my_aggregator.initialize_handler(handler_type(this)); } template - join_node_base(graph &g, FunctionTuple f) : graph_node(g), input_ports_type(g, f), forwarder_busy(false) { - my_successors.set_owner(this); + join_node_base(graph &g, FunctionTuple f) + : graph_node(g), input_ports_type(g, f), forwarder_busy(false), my_successors(this) + { input_ports_type::set_my_node(this); my_aggregator.initialize_handler(handler_type(this)); } diff --git a/include/tbb/detail/_flow_graph_node_impl.h b/include/tbb/detail/_flow_graph_node_impl.h index b16872c311..b57405e991 100644 --- a/include/tbb/detail/_flow_graph_node_impl.h +++ b/include/tbb/detail/_flow_graph_node_impl.h @@ -34,10 +34,6 @@ class function_input_queue : public item_buffer { return this->item_buffer::front(); } - bool pop( T& t ) { - return this->pop_front( t ); - } - void pop() { this->destroy_front(); } @@ -67,27 +63,19 @@ class function_input_base : public receiver, no_assign { static_assert(!has_policy::value || !has_policy::value, ""); //! Constructor for function_input_base - function_input_base( - graph &g, size_t max_concurrency, node_priority_t a_priority) - : my_graph_ref(g), my_max_concurrency(max_concurrency) - , my_concurrency(0), my_priority(a_priority) - , my_queue(!has_policy::value ? 
new input_queue_type() : NULL) - , forwarder_busy(false) + function_input_base( graph &g, size_t max_concurrency, node_priority_t a_priority ) + : my_graph_ref(g), my_max_concurrency(max_concurrency) + , my_concurrency(0), my_priority(a_priority) + , my_queue(!has_policy::value ? new input_queue_type() : NULL) + , my_predecessors(this) + , forwarder_busy(false) { - my_predecessors.set_owner(this); my_aggregator.initialize_handler(handler_type(this)); } //! Copy constructor - function_input_base( const function_input_base& src) - : receiver(), no_assign() - , my_graph_ref(src.my_graph_ref), my_max_concurrency(src.my_max_concurrency) - , my_concurrency(0), my_priority(src.my_priority) - , my_queue(src.my_queue ? new input_queue_type() : NULL), forwarder_busy(false) - { - my_predecessors.set_owner(this); - my_aggregator.initialize_handler(handler_type(this)); - } + function_input_base( const function_input_base& src ) + : function_input_base(src.my_graph_ref, src.my_max_concurrency, src.my_priority) {} //! Destructor // The queue is allocated by the constructor for {multi}function_node. @@ -135,7 +123,7 @@ class function_input_base : public receiver, no_assign { input_queue_type *my_queue; predecessor_cache my_predecessors; - void reset_receiver( reset_flags f) override { + void reset_receiver( reset_flags f) { if( f & rf_clear_edges) my_predecessors.clear(); else my_predecessors.reset(); @@ -255,7 +243,7 @@ class function_input_base : public receiver, no_assign { //! Creates tasks for postponed messages if available and if concurrency allows void internal_forward(operation_type *op) { op->bypass_t = NULL; - if (my_concurrency < my_max_concurrency || !my_max_concurrency) + if (my_concurrency < my_max_concurrency) op->bypass_t = perform_queued_requests(); if(op->bypass_t) op->status.store(SUCCEEDED, std::memory_order_release); @@ -379,7 +367,7 @@ class function_input : public function_input_baseclone() ), my_init_body(src.my_init_body->clone() ) { } -#if __INTEL_COMPILER == 2021 +#if __INTEL_COMPILER <= 2021 // Suppress superfluous diagnostic about virtual keyword absence in a destructor of an inherited // class while the parent class has the virtual keyword for the destrocutor. virtual @@ -459,20 +447,24 @@ template struct clear_element { (void)std::get(p).successors().clear(); clear_element::clear_this(p); } +#if TBB_USE_ASSERT template static bool this_empty(P &p) { if(std::get(p).successors().empty()) return clear_element::this_empty(p); return false; } +#endif }; template<> struct clear_element<1> { template static void clear_this(P &p) { (void)std::get<0>(p).successors().clear(); } +#if TBB_USE_ASSERT template static bool this_empty(P &p) { return std::get<0>(p).successors().empty(); } +#endif }; template @@ -706,10 +698,8 @@ class function_output : public sender { typedef typename sender::successor_type successor_type; typedef broadcast_cache broadcast_cache_type; - function_output( graph& g) : my_graph_ref(g) { my_successors.set_owner(this); } - function_output(const function_output & other) : sender(), my_graph_ref(other.my_graph_ref) { - my_successors.set_owner(this); - } + function_output(graph& g) : my_successors(this), my_graph_ref(g) {} + function_output(const function_output& other) = delete; //! Adds a new successor to this node bool register_successor( successor_type &r ) override { @@ -723,18 +713,6 @@ class function_output : public sender { return true; } - // for multifunction_node. 
The function_body that implements - // the node will have an input and an output tuple of ports. To put - // an item to a successor, the body should - // - // get(output_ports).try_put(output_value); - // - // if task pointer is returned will always spawn and return true, else - // return value will be bool returned from successors.try_put. - graph_task *try_put_task(const output_type &i) { // not a virtual method in this class - return my_successors.try_put_task(i); - } - broadcast_cache_type &successors() { return my_successors; } graph& graph_reference() const { return my_graph_ref; } @@ -750,8 +728,8 @@ class multifunction_output : public function_output { typedef function_output base_type; using base_type::my_successors; - multifunction_output(graph& g) : base_type(g) {my_successors.set_owner(this);} - multifunction_output( const multifunction_output& other) : base_type(other.my_graph_ref) { my_successors.set_owner(this); } + multifunction_output(graph& g) : base_type(g) {} + multifunction_output(const multifunction_output& other) : base_type(other.my_graph_ref) {} bool try_put(const output_type &i) { graph_task *res = try_put_task(i); diff --git a/include/tbb/detail/_flow_graph_trace_impl.h b/include/tbb/detail/_flow_graph_trace_impl.h index e7709a5b01..d214cee4e4 100644 --- a/include/tbb/detail/_flow_graph_trace_impl.h +++ b/include/tbb/detail/_flow_graph_trace_impl.h @@ -312,8 +312,6 @@ static inline void fgt_node_desc( const NodeType * /*node*/, const char * /*desc static inline void fgt_graph_desc( const void * /*g*/, const char * /*desc*/ ) { } -static inline void fgt_body( void * /*node*/, void * /*body*/ ) { } - template< int N, typename PortsTuple > static inline void fgt_multioutput_node( void* /*codeptr*/, string_resource_index /*t*/, void * /*g*/, void * /*input_port*/, PortsTuple & /*ports*/ ) { } @@ -325,7 +323,6 @@ static inline void fgt_multiinput_node( void* /*codeptr*/, string_resource_index static inline void fgt_multiinput_multioutput_node( void* /*codeptr*/, string_resource_index /*t*/, void * /*node*/, void * /*graph*/ ) { } -static inline void fgt_node( void* /*codeptr*/, string_resource_index /*t*/, void * /*g*/, void * /*output_port*/ ) { } static inline void fgt_node( void* /*codeptr*/, string_resource_index /*t*/, void * /*g*/, void * /*input_port*/, void * /*output_port*/ ) { } static inline void fgt_node( void* /*codeptr*/, string_resource_index /*t*/, void * /*g*/, void * /*input_port*/, void * /*decrement_port*/, void * /*output_port*/ ) { } diff --git a/include/tbb/detail/_flow_graph_types_impl.h b/include/tbb/detail/_flow_graph_types_impl.h index d374321fe7..a4f15a5618 100644 --- a/include/tbb/detail/_flow_graph_types_impl.h +++ b/include/tbb/detail/_flow_graph_types_impl.h @@ -73,50 +73,6 @@ struct make_sequence < 0, S... > { typedef sequence type; }; -// Until C++14 std::initializer_list does not guarantee life time of contained objects. 
-template -class initializer_list_wrapper { -public: - typedef T value_type; - typedef const T& reference; - typedef const T& const_reference; - typedef size_t size_type; - - typedef T* iterator; - typedef const T* const_iterator; - - initializer_list_wrapper( std::initializer_list il ) noexcept : my_begin( static_cast(malloc( il.size()*sizeof( T ) )) ) { - iterator dst = my_begin; - for ( typename std::initializer_list::const_iterator src = il.begin(); src != il.end(); ++src ) - new (dst++) T( *src ); - my_end = dst; - } - - initializer_list_wrapper( const initializer_list_wrapper& ilw ) noexcept : my_begin( static_cast(malloc( ilw.size()*sizeof( T ) )) ) { - iterator dst = my_begin; - for ( typename std::initializer_list::const_iterator src = ilw.begin(); src != ilw.end(); ++src ) - new (dst++) T( *src ); - my_end = dst; - } - - initializer_list_wrapper( initializer_list_wrapper&& ilw ) noexcept : my_begin( ilw.my_begin ), my_end( ilw.my_end ) { - ilw.my_begin = ilw.my_end = NULL; - } - - ~initializer_list_wrapper() { - if ( my_begin ) - free( my_begin ); - } - - const_iterator begin() const noexcept { return my_begin; } - const_iterator end() const noexcept { return my_end; } - size_t size() const noexcept { return (size_t)(my_end - my_begin); } - -private: - iterator my_begin; - iterator my_end; -}; - //! type mimicking std::pair but with trailing fill to ensure each element of an array //* will have the correct alignment template @@ -163,7 +119,7 @@ struct default_constructed { }; // struct to allow us to copy and test the type of objects struct WrapperBase { virtual ~WrapperBase() {} - virtual void CopyTo(void* /*newSpace*/) const { } + virtual void CopyTo(void* /*newSpace*/) const = 0; }; // Wrapper contains a T, with the ability to test what T is. 
The Wrapper can be @@ -191,7 +147,7 @@ struct Wrapper: public WrapperBase { }; public: explicit Wrapper( const T& other ) : value_space(other) { } - explicit Wrapper(const Wrapper& other) : value_space(other.value_space) { } + explicit Wrapper(const Wrapper& other) = delete; void CopyTo(void* newSpace) const override { _unwind_space guard((pointer_type)newSpace); diff --git a/include/tbb/detail/_pipeline_filters.h b/include/tbb/detail/_pipeline_filters.h index 949610cb51..1a5d402725 100644 --- a/include/tbb/detail/_pipeline_filters.h +++ b/include/tbb/detail/_pipeline_filters.h @@ -318,7 +318,6 @@ class filter_node_ptr { void operator=(filter_node *); void operator=(const filter_node_ptr &); void operator=(filter_node_ptr &&); - filter_node* operator->() const; filter_node& operator*() const; operator bool() const; }; @@ -418,10 +417,6 @@ inline void filter_node_ptr::operator=(filter_node_ptr && rhs) { } } -inline filter_node* filter_node_ptr::operator->() const { - return my_node; -} - inline filter_node& filter_node_ptr::operator*() const{ __TBB_ASSERT(my_node,"NULL node is used"); return *my_node; diff --git a/include/tbb/detail/_range_common.h b/include/tbb/detail/_range_common.h index 0ebbc1579f..71ffd8d862 100644 --- a/include/tbb/detail/_range_common.h +++ b/include/tbb/detail/_range_common.h @@ -44,12 +44,31 @@ class proportional_split : no_assign { size_t right() const { return my_right; } // used when range does not support proportional split - operator split() const { return split(); } + explicit operator split() const { return split(); } private: size_t my_left, my_right; }; +template +struct range_split_object_provider { + template + static split get( PartitionerSplitType& ) { return split(); } +}; + +template +struct range_split_object_provider::value>::type> { + template + static PartitionerSplitType& get( PartitionerSplitType& split_obj ) { return split_obj; } +}; + +template +auto get_range_split_object( PartitionerSplitType& split_obj ) +-> decltype(range_split_object_provider::get(split_obj)) { + return range_split_object_provider::get(split_obj); +} + } // namespace d0 } // namespace detail } // namespace tbb diff --git a/include/tbb/detail/_task.h b/include/tbb/detail/_task.h index 63e0e9f9b2..556bc27146 100644 --- a/include/tbb/detail/_task.h +++ b/include/tbb/detail/_task.h @@ -29,6 +29,7 @@ #include #include #include +#include namespace tbb { namespace detail { @@ -60,12 +61,14 @@ using suspend_callback_type = void(*)(void*, suspend_point_type*); void __TBB_EXPORTED_FUNC suspend(suspend_callback_type suspend_callback, void* user_callback); void __TBB_EXPORTED_FUNC resume(suspend_point_type* tag); suspend_point_type* __TBB_EXPORTED_FUNC current_suspend_point(); +void __TBB_EXPORTED_FUNC notify_waiters(d1::wait_context& wc); class thread_data; class task_dispatcher; class external_waiter; struct task_accessor; struct task_arena_impl; +struct wait_node; } // namespace r1 namespace d1 { @@ -92,36 +95,64 @@ inline void resume(suspend_point tag) { } #endif /* __TBB_RESUMABLE_TASKS */ +// TODO align wait_context on cache line class wait_context { - static constexpr std::uint64_t abandon_wait_flag = 1LLU << 33; - static constexpr std::uint64_t overflow_mask = ~((1LLU << 32) - 1) & ~abandon_wait_flag; - - std::uint64_t m_version_and_traits{}; + // The flag works as a lock for the wait_context + // All functions use this lock to work with the wait list + static constexpr std::uint64_t lock_flag = 1LLU << 34; + // The flag signals to the last decrementing thread to proceed with the 
diff --git a/include/tbb/detail/_task.h b/include/tbb/detail/_task.h
index 63e0e9f9b2..556bc27146 100644
--- a/include/tbb/detail/_task.h
+++ b/include/tbb/detail/_task.h
@@ -29,6 +29,7 @@
 #include
 #include
 #include
+#include <mutex>
 
 namespace tbb {
 namespace detail {
@@ -60,12 +61,14 @@ using suspend_callback_type = void(*)(void*, suspend_point_type*);
 void __TBB_EXPORTED_FUNC suspend(suspend_callback_type suspend_callback, void* user_callback);
 void __TBB_EXPORTED_FUNC resume(suspend_point_type* tag);
 suspend_point_type* __TBB_EXPORTED_FUNC current_suspend_point();
+void __TBB_EXPORTED_FUNC notify_waiters(d1::wait_context& wc);
 
 class thread_data;
 class task_dispatcher;
 class external_waiter;
 struct task_accessor;
 struct task_arena_impl;
+struct wait_node;
 } // namespace r1
 
 namespace d1 {
@@ -92,36 +95,64 @@ inline void resume(suspend_point tag) {
 }
 #endif /* __TBB_RESUMABLE_TASKS */
 
+// TODO: align wait_context on a cache line
 class wait_context {
-    static constexpr std::uint64_t abandon_wait_flag = 1LLU << 33;
-    static constexpr std::uint64_t overflow_mask = ~((1LLU << 32) - 1) & ~abandon_wait_flag;
-
-    std::uint64_t m_version_and_traits{};
+    // The flag works as a lock for the wait_context;
+    // all functions take this lock to work with the wait list.
+    static constexpr std::uint64_t lock_flag = 1LLU << 34;
+    // The flag signals the last decrementing thread to process the wait list.
+    static constexpr std::uint64_t waiter_flag = 1LLU << 33;
+    static constexpr std::uint64_t overflow_mask = ~((1LLU << 32) - 1) & ~(lock_flag | waiter_flag);
+    using lock_guard = std::lock_guard<wait_context>;
+
+    std::uint64_t m_version_and_traits{1};
     std::atomic<std::uint64_t> m_ref_count{};
-    suspend_point m_waiting_coroutine{};
+    // Pointer to the head of the wait list
+    r1::wait_node* m_wait_head{nullptr};
 
-    void abandon_wait() {
-        __TBB_ASSERT((m_ref_count.load(std::memory_order_relaxed) & abandon_wait_flag) == 0, "The wait object can be abandoned only once");
-        add_reference(abandon_wait_flag);
-    }
+    bool is_locked();
+    void lock();
+    void unlock();
+
+    bool publish_wait_list();
+
+    template <typename F>
+    bool try_register_waiter(r1::wait_node& waiter, F&& condition);
+
+    void unregister_waiter(r1::wait_node& waiter);
+
+    void notify_waiters();
 
     void add_reference(std::int64_t delta) {
         call_itt_task_notify(releasing, this);
         std::uint64_t r = m_ref_count.fetch_add(delta) + delta;
+
         __TBB_ASSERT_EX((r & overflow_mask) == 0, "Overflow is detected");
-        if (r == abandon_wait_flag) {
-            // There is no any more references but the waiting stack is
-            // suspended (abandoned) so resume it.
-            __TBB_ASSERT(m_waiting_coroutine != nullptr, nullptr);
-            m_ref_count.store(0, std::memory_order_relaxed);
-            r1::resume(m_waiting_coroutine);
+
+        if ((r & ~lock_flag) == waiter_flag) {
+            // Some external waiters or coroutine waiters sleep in the wait list;
+            // notify them that the work is done.
+            r1::notify_waiters(*this);
         }
     }
 
     bool continue_execution() const {
         std::uint64_t r = m_ref_count.load(std::memory_order_acquire);
         __TBB_ASSERT_EX((r & overflow_mask) == 0, "Overflow is detected");
-        return (r & ~abandon_wait_flag) > 0;
+        return (r & ~(waiter_flag | lock_flag)) > 0;
+    }
+
+    void wait_for_notification_completion() {
+        // This function prevents a couple of races:
+        // - the wait_context might still be locked by a notifying thread when a notified thread
+        //   destroys the wait_context, e.g. lock or unlock might access a destroyed object;
+        // - the wait_context might reach a zero ref_count again while a notifying thread is still
+        //   in progress, i.e. the current notifying thread might remove the lock and waiter flags
+        //   and the next notifying thread might face the first issue.
+        atomic_backoff backoff;
+        while (!continue_execution() && m_ref_count.load(std::memory_order_relaxed) & (lock_flag | waiter_flag)) {
+            backoff.pause();
+        }
     }
 
     friend class r1::thread_data;
@@ -129,15 +160,26 @@ class wait_context {
     friend class r1::external_waiter;
     friend class task_group;
     friend class task_group_base;
+    friend class std::lock_guard<wait_context>;
     friend struct r1::task_arena_impl;
     friend struct r1::suspend_point_type;
+    friend void r1::notify_waiters(d1::wait_context& wc);
 public:
     // Although the internal reference count is uint64_t, we limit the user interface to uint32_t
     // to preserve a part of the internal reference count for special needs.
     wait_context(std::uint32_t ref_count) : m_ref_count{ref_count} { suppress_unused_warning(m_version_and_traits); }
     wait_context(const wait_context&) = delete;
 
+    ~wait_context() {
+        __TBB_ASSERT(!continue_execution(), NULL);
+        // Wait until all notifying threads leave the wait_context
+        wait_for_notification_completion();
+    }
+
     void reserve(std::uint32_t delta = 1) {
+        // Wait until the notifying thread completes notifications,
+        // so that notify_waiters is not called by more than one thread.
+        wait_for_notification_completion();
         add_reference(delta);
     }
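The lock_flag and waiter_flag bits live above the 32-bit user-visible count, so a single fetch-and-add both releases a reference and tells the releasing thread whether parked waiters must be woken. A toy model of that test, assuming a single 64-bit atomic and no real wait list:

    // Toy model of packing bookkeeping flags above a 32-bit reference count.
    #include <atomic>
    #include <cassert>
    #include <cstdint>

    constexpr std::uint64_t waiter_flag = 1ULL << 33;
    constexpr std::uint64_t lock_flag   = 1ULL << 34;

    std::atomic<std::uint64_t> ref_count{0};

    // Returns true when the caller released the last real reference while
    // waiters are registered, i.e. it must notify them.
    bool release_reference() {
        std::uint64_t r = ref_count.fetch_sub(1) - 1;
        // Ignore the lock bit; if only the waiter bit remains,
        // the work is done and someone is sleeping on it.
        return (r & ~lock_flag) == waiter_flag;
    }

    int main() {
        ref_count.store(2 + waiter_flag); // two tasks outstanding, one waiter parked
        assert(!release_reference());     // first release: work remains
        assert(release_reference());      // last release: notify the waiter
    }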
@@ -223,9 +265,6 @@ class alignas(task_alignment) task : public task_traits {
     virtual task* execute(execution_data&) = 0;
     virtual task* cancel(execution_data&) = 0;
 
-    // TODO: remove in Gold release
-    static task_group_context* current_execute_data() { return current_context(); }
-
 private:
     std::uint64_t m_reserved[5]{};
diff --git a/include/tbb/detail/_template_helpers.h b/include/tbb/detail/_template_helpers.h
index 9f213956ce..7967df6b26 100644
--- a/include/tbb/detail/_template_helpers.h
+++ b/include/tbb/detail/_template_helpers.h
@@ -313,13 +313,7 @@ struct stored_pack : stored_pack<Types...>
             std::forward<Preceding>(params)... , pack.leftmost_value
         );
     }
-    template< typename Ret, typename F, typename... Preceding >
-    static Ret call(F&& f, const pack_type& pack, Preceding&&... params) {
-        return pack_remainder::template call<Ret>(
-            std::forward<F>(f), static_cast<const pack_remainder&>(pack),
-            std::forward<Preceding>(params)... , pack.leftmost_value
-        );
-    }
+
     template< typename Ret, typename F, typename... Preceding >
     static Ret call(F&& f, pack_type&& pack, Preceding&&... params) {
         return pack_remainder::template call<Ret>(
diff --git a/include/tbb/detail/_utils.h b/include/tbb/detail/_utils.h
index c78a65d395..7b7bb9291c 100644
--- a/include/tbb/detail/_utils.h
+++ b/include/tbb/detail/_utils.h
@@ -272,11 +272,6 @@ void run_initializer(const Functor& f, std::atomic<do_once_state>& state ) {
     state.store(do_once_state::executed, std::memory_order_release);
 }
 
-// Run the initializer which can require repeated call
-inline void run_initializer( bool (*f)(), std::atomic<do_once_state>& state ) {
-    state.store(f() ? do_once_state::executed : do_once_state::uninitialized, std::memory_order_release);
-}
-
 } // namespace d0
 
 namespace d1 {
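With the bool-returning overload gone, run_initializer can assume initializers never need to be retried. A minimal sketch of the one-shot protocol that remains; do_once_state and atomic_do_once are simplified here from the surrounding headers:

    // Sketch of the remaining one-shot initialization protocol.
    #include <atomic>
    #include <cassert>

    enum class do_once_state { uninitialized, pending, executed };

    std::atomic<do_once_state> state{do_once_state::uninitialized};

    template <typename Functor>
    void run_initializer(const Functor& f, std::atomic<do_once_state>& st) {
        f();  // initializers are now assumed to always succeed
        st.store(do_once_state::executed, std::memory_order_release);
    }

    template <typename Functor>
    void atomic_do_once(const Functor& f, std::atomic<do_once_state>& st) {
        if (st.load(std::memory_order_acquire) == do_once_state::executed) return;
        do_once_state expected = do_once_state::uninitialized;
        if (st.compare_exchange_strong(expected, do_once_state::pending))
            run_initializer(f, st);          // we won the race; initialize once
        else
            while (st.load(std::memory_order_acquire) != do_once_state::executed) {}
    }

    int main() {
        int hits = 0;
        atomic_do_once([&] { ++hits; }, state);
        atomic_do_once([&] { ++hits; }, state);
        assert(hits == 1);
    }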
diff --git a/include/tbb/enumerable_thread_specific.h b/include/tbb/enumerable_thread_specific.h
index 3f947ac1f2..20227af466 100644
--- a/include/tbb/enumerable_thread_specific.h
+++ b/include/tbb/enumerable_thread_specific.h
@@ -529,11 +529,9 @@ class segmented_iterator
     // assignment
     template<typename U>
     segmented_iterator& operator=( const segmented_iterator<SegmentedContainer, U>& other) {
-        if(this != &other) {
-            my_segcont = other.my_segcont;
-            outer_iter = other.outer_iter;
-            if(outer_iter != my_segcont->end()) inner_iter = other.inner_iter;
-        }
+        my_segcont = other.my_segcont;
+        outer_iter = other.outer_iter;
+        if(outer_iter != my_segcont->end()) inner_iter = other.inner_iter;
         return *this;
     }
 
@@ -638,7 +636,6 @@ template<typename T, typename Finit>
 struct construct_by_finit: no_assign {
     Finit f;
     void construct(void* where) {new(where) T(f());}
-    construct_by_finit( const Finit& f_ ) : f(f_) {}
     construct_by_finit( Finit&& f_ ) : f(std::move(f_)) {}
 };
diff --git a/include/tbb/flow_graph.h b/include/tbb/flow_graph.h
index f71bfabaea..86602affa1 100644
--- a/include/tbb/flow_graph.h
+++ b/include/tbb/flow_graph.h
@@ -209,9 +209,6 @@ class receiver {
     virtual graph_task *try_put_task(const T& t) = 0;
     virtual graph& graph_reference() const = 0;
 
-    //! put receiver back in initial state
-    virtual void reset_receiver(reset_flags f = rf_reset_protocol) = 0;
-
     template<typename TT, typename M> friend class successor_cache;
     virtual bool is_continue_receiver() { return false; }
@@ -307,7 +304,7 @@ class continue_receiver : public receiver< continue_msg > {
 
     // error in gcc 4.1.2
     template<typename U, typename V> friend class limiter_node;
 
-    void reset_receiver( reset_flags f ) override {
+    virtual void reset_receiver( reset_flags f ) {
         my_current_count = 0;
         if (f & rf_clear_edges) {
             my_predecessor_count = my_initial_predecessor_count;
@@ -438,7 +435,7 @@ inline void graph::reset( reset_flags f ) {
     // reset context
     deactivate_graph(*this);
 
-    if(my_context) my_context->reset();
+    my_context->reset();
     cancelled = false;
     caught_exception = false;
     // reset all the nodes comprising the graph
@@ -492,34 +489,33 @@ class input_node : public graph_node, public sender< Output > {
     //! Constructor for a node with a successor
     template< typename Body >
     __TBB_NOINLINE_SYM input_node( graph &g, Body body )
-        : graph_node(g), my_active(false),
-          my_body( new input_body_leaf< output_type, Body>(body) ),
-          my_init_body( new input_body_leaf< output_type, Body>(body) ),
-          my_reserved(false), my_has_cached_item(false)
+        : graph_node(g), my_active(false)
+        , my_body( new input_body_leaf< output_type, Body>(body) )
+        , my_init_body( new input_body_leaf< output_type, Body>(body) )
+        , my_successors(this), my_reserved(false), my_has_cached_item(false)
     {
-        my_successors.set_owner(this);
-        fgt_node_with_body( CODEPTR(), FLOW_INPUT_NODE, &this->my_graph,
-                            static_cast<sender<output_type> *>(this), this->my_body );
+        fgt_node_with_body(CODEPTR(), FLOW_INPUT_NODE, &this->my_graph,
+                           static_cast<sender<output_type> *>(this), this->my_body);
     }
 
 #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET
     template
     input_node( const node_set& successors, Body body )
-        : input_node(successors.graph_reference(), body) {
+        : input_node(successors.graph_reference(), body)
+    {
         make_edges(*this, successors);
     }
 #endif
 
     //! Copy constructor
-    __TBB_NOINLINE_SYM input_node( const input_node& src ) :
-        graph_node(src.my_graph), sender<Output>(),
-        my_active(false),
-        my_body( src.my_init_body->clone() ), my_init_body(src.my_init_body->clone() ),
-        my_reserved(false), my_has_cached_item(false)
+    __TBB_NOINLINE_SYM input_node( const input_node& src )
+        : graph_node(src.my_graph), sender<Output>()
+        , my_active(false)
+        , my_body(src.my_init_body->clone()), my_init_body(src.my_init_body->clone())
+        , my_successors(this), my_reserved(false), my_has_cached_item(false)
    {
-        my_successors.set_owner(this);
         fgt_node_with_body(CODEPTR(), FLOW_INPUT_NODE, &this->my_graph,
-                            static_cast<sender<output_type> *>(this), this->my_body );
+                           static_cast<sender<output_type> *>(this), this->my_body);
     }
 
     //!
The destructor @@ -897,7 +893,6 @@ class split_node : public graph_node, public receiver { __TBB_ASSERT(!(f & rf_clear_edges) || clear_element::this_empty(my_output_ports), "split_node reset failed"); } - void reset_receiver(reset_flags /*f*/) override {} graph& graph_reference() const override { return my_graph; } @@ -1013,10 +1008,9 @@ class broadcast_node : public graph_node, public receiver, public sender { broadcast_cache my_successors; public: - __TBB_NOINLINE_SYM explicit broadcast_node(graph& g) : graph_node(g) { - my_successors.set_owner( this ); + __TBB_NOINLINE_SYM explicit broadcast_node(graph& g) : graph_node(g), my_successors(this) { fgt_node( CODEPTR(), FLOW_BROADCAST_NODE, &this->my_graph, - static_cast *>(this), static_cast *>(this) ); + static_cast *>(this), static_cast *>(this) ); } #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET @@ -1027,13 +1021,7 @@ class broadcast_node : public graph_node, public receiver, public sender { #endif // Copy constructor - __TBB_NOINLINE_SYM broadcast_node( const broadcast_node& src ) : - graph_node(src.my_graph), receiver(), sender() - { - my_successors.set_owner( this ); - fgt_node( CODEPTR(), FLOW_BROADCAST_NODE, &this->my_graph, - static_cast *>(this), static_cast *>(this) ); - } + __TBB_NOINLINE_SYM broadcast_node( const broadcast_node& src ) : broadcast_node(src.my_graph) {} //! Adds a successor bool register_successor( successor_type &r ) override { @@ -1062,8 +1050,6 @@ class broadcast_node : public graph_node, public receiver, public sender { return my_graph; } - void reset_receiver(reset_flags /*f*/) override {} - void reset_node(reset_flags f) override { if (f&rf_clear_edges) { my_successors.clear(); @@ -1291,9 +1277,8 @@ class buffer_node //! Constructor __TBB_NOINLINE_SYM explicit buffer_node( graph &g ) : graph_node(g), reservable_item_buffer(), receiver(), - sender(), forwarder_busy(false) + sender(), my_successors(this), forwarder_busy(false) { - my_successors.set_owner(this); my_aggregator.initialize_handler(handler_type(this)); fgt_node( CODEPTR(), FLOW_BUFFER_NODE, &this->my_graph, static_cast *>(this), static_cast *>(this) ); @@ -1307,15 +1292,7 @@ class buffer_node #endif //! 
Copy constructor - __TBB_NOINLINE_SYM buffer_node( const buffer_node& src ) - : graph_node(src.my_graph), reservable_item_buffer(), - receiver(), sender(), forwarder_busy(false) - { - my_successors.set_owner(this); - my_aggregator.initialize_handler(handler_type(this)); - fgt_node( CODEPTR(), FLOW_BUFFER_NODE, &this->my_graph, - static_cast *>(this), static_cast *>(this) ); - } + __TBB_NOINLINE_SYM buffer_node( const buffer_node& src ) : buffer_node(src.my_graph) {} // // message sender implementation @@ -1416,8 +1393,6 @@ class buffer_node return my_graph; } - void reset_receiver(reset_flags /*f*/) override { } - protected: void reset_node( reset_flags f) override { reservable_item_buffer::reset(); @@ -1889,11 +1864,6 @@ class limiter_node : public graph_node, public receiver< T >, public sender< T > } } - void forward() { - __TBB_ASSERT(false, "Should never be called"); - return; - } - graph_task* decrement_counter( long long delta ) { { spin_mutex::scoped_lock lock(my_mutex); @@ -1908,9 +1878,6 @@ class limiter_node : public graph_node, public receiver< T >, public sender< T > } void initialize() { - my_predecessors.set_owner(this); - my_successors.set_owner(this); - decrement.set_owner(this); fgt_node( CODEPTR(), FLOW_LIMITER_NODE, &this->my_graph, static_cast *>(this), static_cast *>(&decrement), @@ -1923,7 +1890,8 @@ class limiter_node : public graph_node, public receiver< T >, public sender< T > //! Constructor limiter_node(graph &g, size_t threshold) - : graph_node(g), my_threshold(threshold), my_count(0), my_tries(0), decrement() + : graph_node(g), my_threshold(threshold), my_count(0), my_tries(0), my_predecessors(this) + , my_successors(this), decrement(this) { initialize(); } @@ -1937,12 +1905,7 @@ class limiter_node : public graph_node, public receiver< T >, public sender< T > #endif //! Copy constructor - limiter_node( const limiter_node& src ) : - graph_node(src.my_graph), receiver(), sender(), - my_threshold(src.my_threshold), my_count(0), my_tries(0), decrement() - { - initialize(); - } + limiter_node( const limiter_node& src ) : limiter_node(src.my_graph, src.my_threshold) {} //! 
Replace the current successor with this new successor bool register_successor( successor_type &r ) override { @@ -2028,10 +1991,6 @@ class limiter_node : public graph_node, public receiver< T >, public sender< T > graph& graph_reference() const override { return my_graph; } - void reset_receiver(reset_flags /*f*/) override { - __TBB_ASSERT(false,NULL); // should never be called - } - void reset_node( reset_flags f) override { my_count = 0; if(f & rf_clear_edges) { @@ -2118,15 +2077,6 @@ class join_node > : public unfolded_join_nod #if __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING join_node(graph &g) : unfolded_type(g) {} - -#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET - template - join_node(const node_set& nodes, key_matching = key_matching()) - : join_node(nodes.graph_reference()) { - make_edges_in_order(nodes, *this); - } -#endif - #endif /* __TBB_PREVIEW_MESSAGE_BASED_KEY_MATCHING */ template @@ -2768,25 +2718,13 @@ class async_node typedef Input input_type; typedef Output output_type; typedef receiver receiver_type; - typedef typename receiver_type::predecessor_type predecessor_type; - typedef typename sender::successor_type successor_type; + typedef receiver successor_type; + typedef sender predecessor_type; typedef receiver_gateway gateway_type; typedef async_body_base async_body_base_type; typedef typename base_type::output_ports_type output_ports_type; private: - struct try_put_functor { - typedef multifunction_output output_port_type; - output_port_type *port; - // TODO: pass value by copy since we do not want to block asynchronous thread. - const Output *value; - bool result; - try_put_functor(output_port_type &p, const Output &v) : port(&p), value(&v), result(false) { } - void operator()() { - result = port->try_put(*value); - } - }; - class receiver_gateway_impl: public receiver_gateway { public: receiver_gateway_impl(async_node* node): my_node(node) {} @@ -2880,13 +2818,15 @@ class async_node // Define sender< Output > //! Add a new successor to this node - bool register_successor( successor_type &r ) override { - return output_port<0>(*this).register_successor(r); + bool register_successor(successor_type&) override { + __TBB_ASSERT(false, "Successors must be registered only via ports"); + return false; } //! Removes a successor from this node - bool remove_successor( successor_type &r ) override { - return output_port<0>(*this).remove_successor(r); + bool remove_successor(successor_type&) override { + __TBB_ASSERT(false, "Successors must be removed only via ports"); + return false; } template @@ -2915,10 +2855,11 @@ class overwrite_node : public graph_node, public receiver, public sender { typedef typename receiver::predecessor_type predecessor_type; typedef typename sender::successor_type successor_type; - __TBB_NOINLINE_SYM explicit overwrite_node(graph &g) : graph_node(g), my_buffer_is_valid(false) { - my_successors.set_owner( this ); + __TBB_NOINLINE_SYM explicit overwrite_node(graph &g) + : graph_node(g), my_successors(this), my_buffer_is_valid(false) + { fgt_node( CODEPTR(), FLOW_OVERWRITE_NODE, &this->my_graph, - static_cast *>(this), static_cast *>(this) ); + static_cast *>(this), static_cast *>(this) ); } #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET @@ -2929,17 +2870,11 @@ class overwrite_node : public graph_node, public receiver, public sender { #endif //! 
Copy constructor; doesn't take anything from src; default won't work - __TBB_NOINLINE_SYM overwrite_node( const overwrite_node& src ) : - graph_node(src.my_graph), receiver(), sender(), my_buffer_is_valid(false) - { - my_successors.set_owner( this ); - fgt_node( CODEPTR(), FLOW_OVERWRITE_NODE, &this->my_graph, - static_cast *>(this), static_cast *>(this) ); - } + __TBB_NOINLINE_SYM overwrite_node( const overwrite_node& src ) : overwrite_node(src.my_graph) {} ~overwrite_node() {} - bool register_successor( successor_type &s ) override { + bool register_successor( successor_type &s ) override { spin_mutex::scoped_lock l( my_mutex ); if (my_buffer_is_valid && is_graph_active( my_graph )) { // We have a valid value that must be forwarded immediately. @@ -3048,7 +2983,6 @@ class overwrite_node : public graph_node, public receiver, public sender { broadcast_cache< input_type, null_rw_mutex > my_successors; input_type my_buffer; bool my_buffer_is_valid; - void reset_receiver(reset_flags /*f*/) override {} void reset_node( reset_flags f) override { my_buffer_is_valid = false; diff --git a/include/tbb/memory_pool.h b/include/tbb/memory_pool.h index 7c3e57c99e..6d47d5ec1d 100644 --- a/include/tbb/memory_pool.h +++ b/include/tbb/memory_pool.h @@ -213,8 +213,15 @@ void *memory_pool::allocate_request(intptr_t pool_id, size_t & bytes) { const size_t unit_size = sizeof(typename Alloc::value_type); __TBBMALLOC_ASSERT( 0 == bytes%unit_size, NULL); void *ptr; - __TBB_TRY { ptr = self.my_alloc.allocate( bytes/unit_size ); } - __TBB_CATCH(...) { return 0; } +#if TBB_USE_EXCEPTIONS + try { +#endif + ptr = self.my_alloc.allocate( bytes/unit_size ); +#if TBB_USE_EXCEPTIONS + } catch(...) { + return 0; + } +#endif return ptr; } #if __TBB_MSVC_UNREACHABLE_CODE_IGNORED diff --git a/include/tbb/parallel_for.h b/include/tbb/parallel_for.h index 9a8301986a..1a294327bf 100644 --- a/include/tbb/parallel_for.h +++ b/include/tbb/parallel_for.h @@ -58,7 +58,7 @@ struct start_for : public task { //! Splitting constructor used to generate children. /** parent_ becomes left child. Newly constructed object is right child. */ start_for( start_for& parent_, typename Partitioner::split_type& split_obj, small_object_allocator& alloc ) : - my_range(parent_.my_range, split_obj), + my_range(parent_.my_range, get_range_split_object(split_obj)), my_body(parent_.my_body), my_partition(parent_.my_partition, split_obj), my_allocator(alloc) {} diff --git a/include/tbb/parallel_reduce.h b/include/tbb/parallel_reduce.h index 526d7952f7..25a9a8d2a0 100644 --- a/include/tbb/parallel_reduce.h +++ b/include/tbb/parallel_reduce.h @@ -83,7 +83,7 @@ struct start_reduce : public task { //! Splitting constructor used to generate children. /** parent_ becomes left child. Newly constructed object is right child. */ start_reduce( start_reduce& parent_, typename Partitioner::split_type& split_obj, small_object_allocator& alloc ) : - my_range(parent_.my_range, split_obj), + my_range(parent_.my_range, get_range_split_object(split_obj)), my_body(parent_.my_body), my_partition(parent_.my_partition, split_obj), my_allocator(alloc), @@ -238,21 +238,10 @@ struct start_deterministic_reduce : public task { /** parent_ becomes left child. Newly constructed object is right child. 
*/ start_deterministic_reduce( start_deterministic_reduce& parent_, typename Partitioner::split_type& split_obj, Body& body, small_object_allocator& alloc ) : - my_range(parent_.my_range, split_obj), + my_range(parent_.my_range, get_range_split_object(split_obj)), my_body(body), my_partition(parent_.my_partition, split_obj), my_allocator(alloc) {} - //! Construct right child from the given range as response to the demand. - /** parent_ remains left child. Newly constructed object is right child. */ - start_deterministic_reduce( start_deterministic_reduce& parent_, const Range& r, depth_t d, Body& body, - small_object_allocator& alloc ) : - my_range(r), - my_body(body), - my_partition(parent_.my_partition, split()), - my_allocator(alloc) - { - my_partition.align_depth( d ); - } static void run(const Range& range, Body& body, Partitioner& partitioner, task_group_context& context) { if ( !range.empty() ) { wait_node wn; @@ -274,16 +263,10 @@ struct start_deterministic_reduce : public task { void run_body( Range &r ) { my_body( r ); } - //! Spawn right task, serves as callback for partitioner void offer_work(typename Partitioner::split_type& split_obj, execution_data& ed) { offer_work_impl(ed, *this, split_obj); } - //! Spawn right task, serves as callback for partitioner - void offer_work(const Range& r, depth_t d, execution_data& ed) { - offer_work_impl(ed, *this, r, d); - } - private: template void offer_work_impl(execution_data& ed, Args&&... args) { diff --git a/include/tbb/partitioner.h b/include/tbb/partitioner.h index b625398a24..10905106bf 100644 --- a/include/tbb/partitioner.h +++ b/include/tbb/partitioner.h @@ -133,7 +133,6 @@ struct tree_node : public node { small_object_allocator m_allocator; std::atomic m_child_stolen{false}; - tree_node(small_object_allocator& alloc) : m_allocator{alloc} {} tree_node(node* parent, int ref_count, small_object_allocator& alloc) : node{parent, ref_count} , m_allocator{alloc} {} @@ -260,11 +259,6 @@ struct partition_type_base { void note_affinity( slot_id ) {} template bool check_being_stolen(Task&, const execution_data&) { return false; } // part of old should_execute_range() - template - bool check_for_demand(Task& ) { return false; } - bool is_divisible() { return true; } // part of old should_execute_range() - depth_t max_depth() { return 0; } - void align_depth(depth_t) { } template split_type get_split() { return split(); } Partition& self() { return *static_cast(this); } // CRTP helper @@ -532,8 +526,6 @@ class static_partition_type : public linear_affinity_mode typedef detail::proportional_split split_type; static_partition_type( const static_partitioner& ) : linear_affinity_mode() {} - static_partition_type( static_partition_type& p, split ) - : linear_affinity_mode(p, split()) {} static_partition_type( static_partition_type& p, const proportional_split& split_obj ) : linear_affinity_mode(p, split_obj) {} }; @@ -597,7 +589,6 @@ class simple_partitioner { public: bool should_execute_range(const execution_data& ) {return false;} partition_type( const simple_partitioner& ) {} - partition_type( const partition_type& ) {} partition_type( const partition_type&, split ) {} }; }; @@ -632,8 +623,6 @@ class auto_partitioner { } partition_type( const auto_partitioner& ) : num_chunks(get_initial_auto_partitioner_divisor()*__TBB_INITIAL_CHUNKS/4) {} - partition_type( const partition_type& ) - : num_chunks(get_initial_auto_partitioner_divisor()*__TBB_INITIAL_CHUNKS/4) {} partition_type( partition_type& pt, split ) { num_chunks = pt.num_chunks = 
(pt.num_chunks+1u) / 2u; } diff --git a/include/tbb/profiling.h b/include/tbb/profiling.h index 7367cf679d..eb8217fef7 100644 --- a/include/tbb/profiling.h +++ b/include/tbb/profiling.h @@ -126,7 +126,7 @@ namespace d1 { } #else inline void call_itt_task_notify(d1::notify_type, void *) {} -#endif // TBB_USE_PROFILING_TOOLS +#endif // TBB_USE_PROFILING_TOOLS inline void call_itt_notify(d1::notify_type t, void *ptr) { r1::call_itt_notify((int)t, ptr); @@ -192,46 +192,8 @@ namespace d1 { inline void call_itt_notify(notify_type /*t*/, void* /*ptr*/) {} inline void call_itt_task_notify(notify_type /*t*/, void* /*ptr*/) {} - - inline void itt_make_task_group( itt_domain_enum /*domain*/, void* /*group*/, unsigned long long /*group_extra*/, - void* /*parent*/, unsigned long long /*parent_extra*/, string_resource_index /*name_index*/ ) {} - - inline void itt_metadata_str_add( itt_domain_enum /*domain*/, void* /*addr*/, unsigned long long /*addr_extra*/, - string_resource_index /*key*/, const char* /*value*/ ) {} - - inline void register_node_addr( itt_domain_enum /*domain*/, void* /*addr*/, unsigned long long /*addr_extra*/, string_resource_index /*key*/, void* /*value*/ ) {} - - inline void itt_relation_add( itt_domain_enum /*domain*/, void* /*addr0*/, unsigned long long /*addr0_extra*/, - itt_relation /*relation*/, void* /*addr1*/, unsigned long long /*addr1_extra*/ ) {} - - inline void itt_task_begin( itt_domain_enum /*domain*/, void* /*task*/, unsigned long long /*task_extra*/, - void* /*parent*/, unsigned long long /*parent_extra*/, string_resource_index /*name_index*/ ) {} - - inline void itt_task_end( itt_domain_enum /*domain*/ ) {} - - inline void itt_region_begin( itt_domain_enum /*domain*/, void* /*region*/, unsigned long long /*region_extra*/, - void* /*parent*/, unsigned long long /*parent_extra*/, string_resource_index /*name_index*/ ) {} - - inline void itt_region_end( itt_domain_enum /*domain*/, void* /*region*/, unsigned long long /*region_extra*/ ) {} - #endif // TBB_USE_PROFILING_TOOLS - template - inline void store_with_release_itt(std::atomic& dst, T src) { - call_itt_task_notify(releasing, &dst); - dst.store(src, std::memory_order_release); - } - - template - inline T load_with_acquire_itt(const std::atomic& src) { - call_itt_task_notify(acquired, &src); - return src.load(std::memory_order_acquire); - } -} // namespace d1 -} // namespace detail - -namespace detail { -namespace d1 { #if TBB_USE_PROFILING_TOOLS && !(TBB_USE_PROFILING_TOOLS == 2) class event { /** This class supports user event traces through itt. 
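The next file drops the __TBB_TRY/__TBB_CATCH/__TBB_THROW macros and spells out TBB_USE_EXCEPTIONS conditionals directly, as memory_pool.h already did above. A standalone sketch of the resulting style; allocate_or_null is an illustrative name, not a library function:

    // Sketch: explicit conditional compilation instead of try/catch macros.
    #include <cstddef>
    #include <memory>
    #ifndef TBB_USE_EXCEPTIONS
    #define TBB_USE_EXCEPTIONS 1   // assumption for this standalone sketch
    #endif

    template <typename Alloc>
    void* allocate_or_null(Alloc& a, std::size_t n) {
    #if TBB_USE_EXCEPTIONS
        try {
    #endif
            return a.allocate(n);
    #if TBB_USE_EXCEPTIONS
        } catch (...) {
            return nullptr;        // allocation failure reported as null, not thrown
        }
    #endif
    }

    int main() {
        std::allocator<int> a;
        int* p = static_cast<int*>(allocate_or_null(a, 4));
        a.deallocate(p, 4);
    }

The macro version compiled the catch away by guarding it with an always-false if; writing the #if blocks inline keeps the control flow visible at the use site.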
diff --git a/include/tbb/scalable_allocator.h b/include/tbb/scalable_allocator.h index 6a8c69972e..2f36117800 100644 --- a/include/tbb/scalable_allocator.h +++ b/include/tbb/scalable_allocator.h @@ -191,19 +191,6 @@ std::size_t pool_msize(MemoryPool *memPool, void *object); } // namespace rml -#if TBB_USE_EXCEPTIONS - #define __TBB_TRY try - #define __TBB_CATCH(e) catch(e) - #define __TBB_THROW(e) throw e - #define __TBB_RETHROW() throw -#else /* !TBB_USE_EXCEPTIONS */ - inline bool __TBB_false() { return false; } - #define __TBB_TRY - #define __TBB_CATCH(e) if ( __TBB_false() ) - #define __TBB_THROW(e) tbb::detail::suppress_unused_warning(e) - #define __TBB_RETHROW() ((void)0) -#endif /* !TBB_USE_EXCEPTIONS */ - namespace tbb { namespace detail { namespace d1 { @@ -211,7 +198,11 @@ namespace d1 { // keep throw in a separate function to prevent code bloat template void throw_exception(const E &e) { - __TBB_THROW(e); +#if TBB_USE_EXCEPTIONS + throw e; +#else + suppress_unused_warning(e); +#endif } template diff --git a/include/tbb/task_arena.h b/include/tbb/task_arena.h index 5fedf9078d..35c370b133 100644 --- a/include/tbb/task_arena.h +++ b/include/tbb/task_arena.h @@ -131,44 +131,24 @@ class task_arena_base { #if __TBB_NUMA_SUPPORT //! The NUMA node index to which the arena will be attached numa_node_id my_numa_id; - - // Do not access my_numa_id without the following runtime check. - // Despite my_numa_id is accessible, it does not exist in task_arena_base on user side - // if TBB_PREVIEW_NUMA_SUPPORT macro is not defined by the user. To be sure that - // my_numa_id exists in task_arena_base layout we check the traits. - // TODO: Consider increasing interface version for task_arena_base instead of this runtime check. - numa_node_id numa_id() { - return (my_version_and_traits & numa_support_flag) == numa_support_flag ? my_numa_id : automatic; - } #endif - enum { - default_flags = 0 -#if __TBB_NUMA_SUPPORT - , numa_support_flag = 1 -#endif - }; + enum { default_flags = 0 }; task_arena_base(int max_concurrency, unsigned reserved_for_masters, priority a_priority) : -#if __TBB_NUMA_SUPPORT - my_version_and_traits(default_flags | numa_support_flag) -#else my_version_and_traits(default_flags) -#endif , my_initialization_state(do_once_state::uninitialized) , my_arena(nullptr) , my_max_concurrency(max_concurrency) , my_master_slots(reserved_for_masters) , my_priority(a_priority) -#if __TBB_NUMA_SUPPORT , my_numa_id(automatic) -#endif {} #if __TBB_NUMA_SUPPORT task_arena_base(const constraints& constraints_, unsigned reserved_for_masters, priority a_priority) - : my_version_and_traits(default_flags | numa_support_flag) + : my_version_and_traits(default_flags) , my_initialization_state(do_once_state::uninitialized) , my_arena(nullptr) , my_max_concurrency(constraints_.max_concurrency) diff --git a/include/tbb/task_group.h b/include/tbb/task_group.h index 14ec4c365a..70a6d6dd9f 100644 --- a/include/tbb/task_group.h +++ b/include/tbb/task_group.h @@ -360,13 +360,15 @@ class task_group_base : no_copy { task_group_status internal_run_and_wait(const F& f) { function_stack_task t{ f, m_wait_ctx }; m_wait_ctx.reserve(); + bool cancellation_status = false; try_call([&] { execute_and_wait(t, m_context, m_wait_ctx, m_context); }).on_completion([&] { // TODO: the reset method is not thread-safe. Ensure the correct behavior. + cancellation_status = m_context.is_group_execution_cancelled(); m_context.reset(); }); - return m_context.is_group_execution_cancelled() ? 
canceled : complete; + return cancellation_status ? canceled : complete; } template @@ -401,13 +403,15 @@ class task_group_base : no_copy { } task_group_status wait() { + bool cancellation_status = false; try_call([&] { d1::wait(m_wait_ctx, m_context); }).on_completion([&] { // TODO: the reset method is not thread-safe. Ensure the correct behavior. + cancellation_status = m_context.is_group_execution_cancelled(); m_context.reset(); }); - return m_context.is_group_execution_cancelled() ? canceled : complete; + return cancellation_status ? canceled : complete; } bool is_canceling() { diff --git a/include/tbb/version.h b/include/tbb/version.h index b5bd318b9a..38c2607feb 100644 --- a/include/tbb/version.h +++ b/include/tbb/version.h @@ -26,12 +26,12 @@ // "Patch" version for custom releases #define TBB_VERSION_PATCH 0 // Suffix string -#define __TBB_VERSION_SUFFIX "-beta09" +#define __TBB_VERSION_SUFFIX "-beta10" // Full official version string #define TBB_VERSION_STRING __TBB_STRING(TBB_VERSION_MAJOR) "." __TBB_STRING(TBB_VERSION_MINOR) __TBB_VERSION_SUFFIX // Full interface version -#define TBB_INTERFACE_VERSION 12003 +#define TBB_INTERFACE_VERSION 12004 // Major interface version #define TBB_INTERFACE_VERSION_MAJOR (TBB_INTERFACE_VERSION/1000) // Minor interface version diff --git a/python/rml/ipc_server.cpp b/python/rml/ipc_server.cpp index 5728dc40bb..bf860484ce 100644 --- a/python/rml/ipc_server.cpp +++ b/python/rml/ipc_server.cpp @@ -772,7 +772,9 @@ void ipc_stopper::run() { if( my_state.load(std::memory_order_acquire)!=st_quit ) { if( !my_server.stop_one() ) { my_server.add_stop_thread(); - tbb::detail::r1::prolonged_pause(); + // Workaround for prolonged_pause. + // Reconsider this for new updates. + tbb::detail::r1::prolonged_pause_impl(); } } } diff --git a/src/tbb/CMakeLists.txt b/src/tbb/CMakeLists.txt index b814ee3ac7..b145667b27 100644 --- a/src/tbb/CMakeLists.txt +++ b/src/tbb/CMakeLists.txt @@ -70,10 +70,11 @@ target_compile_options(tbb # Avoid use of target_link_libraries here as it changes /DEF option to \DEF on Windows. set_target_properties(tbb PROPERTIES - LINK_FLAGS ${TBB_LINK_DEF_FILE_FLAG}${CMAKE_CURRENT_SOURCE_DIR}/def/${TBB_DEF_FILE_PREFIX}-tbb.def DEFINE_SYMBOL "" VERSION ${TBB_BINARY_VERSION}.${TBB_BINARY_MINOR_VERSION} SOVERSION ${TBB_BINARY_VERSION} + LINK_FLAGS ${TBB_LINK_DEF_FILE_FLAG}${CMAKE_CURRENT_SOURCE_DIR}/def/${TBB_DEF_FILE_PREFIX}-tbb.def + LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/def/${TBB_DEF_FILE_PREFIX}-tbb.def ) # Prefer using target_link_options instead of target_link_libraries to specify link options because diff --git a/src/tbb/allocator.cpp b/src/tbb/allocator.cpp index 056faebaa1..438493c867 100644 --- a/src/tbb/allocator.cpp +++ b/src/tbb/allocator.cpp @@ -54,21 +54,15 @@ namespace r1 { //! Initialization routine used for first indirect call via allocate_handler. static void* initialize_allocate_handler(std::size_t size); -//! Initialization routine used for first indirect call via deallocate_handler. -static void initialize_deallocate_handler(void* ptr); - //! Handler for memory allocation static void* (*allocate_handler)(std::size_t size) = &initialize_allocate_handler; //! Handler for memory deallocation -static void (*deallocate_handler)(void* pointer) = &initialize_deallocate_handler; +static void (*deallocate_handler)(void* pointer) = nullptr; //! Initialization routine used for first indirect call via cache_aligned_allocate_handler. static void* initialize_cache_aligned_allocate_handler(std::size_t n, std::size_t alignment); -//! 
Initialization routine used for first indirect call via cache_aligned_deallocate_handler.
-static void initialize_cache_aligned_deallocate_handler(void* ptr);
-
 //! Allocates memory using standard malloc. It is used when scalable_allocator is not available
 static void* std_cache_aligned_allocate(std::size_t n, std::size_t alignment);
 
@@ -79,7 +73,7 @@ static void std_cache_aligned_deallocate(void* p);
 static void* (*cache_aligned_allocate_handler)(std::size_t n, std::size_t alignment) = &initialize_cache_aligned_allocate_handler;
 
 //! Handler for padded memory deallocation
-static void (*cache_aligned_deallocate_handler)(void* p) = &initialize_cache_aligned_deallocate_handler;
+static void (*cache_aligned_deallocate_handler)(void* p) = nullptr;
 
 //! Table describing how to link the handlers.
 static const dynamic_link_descriptor MallocLinkTable[] = {
@@ -142,13 +136,6 @@ static void* initialize_allocate_handler(std::size_t size) {
     return (*allocate_handler)(size);
 }
 
-//! Executed on very first call through deallocate_handler
-static void initialize_deallocate_handler(void* ptr) {
-    initialize_cache_aligned_allocator();
-    __TBB_ASSERT(deallocate_handler != &initialize_deallocate_handler, NULL);
-    (*deallocate_handler)(ptr);
-}
-
 //! Executed on very first call through cache_aligned_allocate_handler
 static void* initialize_cache_aligned_allocate_handler(std::size_t bytes, std::size_t alignment) {
     initialize_cache_aligned_allocator();
@@ -156,13 +143,6 @@ static void* initialize_cache_aligned_allocate_handler(std::size_t bytes, std::size_t alignment) {
     return (*cache_aligned_allocate_handler)(bytes, alignment);
 }
 
-//! Executed on very first call through cache_aligned_deallocate_handler
-static void initialize_cache_aligned_deallocate_handler(void* ptr) {
-    initialize_cache_aligned_allocator();
-    __TBB_ASSERT(cache_aligned_deallocate_handler != &initialize_cache_aligned_deallocate_handler, NULL);
-    (*cache_aligned_deallocate_handler)(ptr);
-}
-
 // TODO: use CPUID to find actual line size, though consider backward compatibility
 // nfs - no false sharing
 static constexpr std::size_t nfs_size = 128;
@@ -191,6 +171,7 @@ void* __TBB_EXPORTED_FUNC cache_aligned_allocate(std::size_t size) {
 }
 
 void __TBB_EXPORTED_FUNC cache_aligned_deallocate(void* p) {
+    __TBB_ASSERT(cache_aligned_deallocate_handler, "Initialization has not been done yet.");
     (*cache_aligned_deallocate_handler)(p);
 }
 
@@ -231,6 +212,7 @@ void* __TBB_EXPORTED_FUNC allocate_memory(std::size_t size) {
 
 void __TBB_EXPORTED_FUNC deallocate_memory(void* p) {
     if (p) {
+        __TBB_ASSERT(deallocate_handler, "Initialization has not been done yet.");
        (*deallocate_handler)(p);
     }
 }
@@ -240,7 +222,7 @@ bool __TBB_EXPORTED_FUNC is_tbbmalloc_used() {
         void* void_ptr = allocate_handler(1);
         deallocate_handler(void_ptr);
     }
-    __TBB_ASSERT(allocate_handler != &initialize_allocate_handler && deallocate_handler != &initialize_deallocate_handler, NULL);
+    __TBB_ASSERT(allocate_handler != &initialize_allocate_handler && deallocate_handler != nullptr, NULL);
     // Cast to void avoids type mismatch errors on some compilers (e.g.
__IBMCPP__) __TBB_ASSERT((reinterpret_cast(allocate_handler) == reinterpret_cast(&std::malloc)) == (reinterpret_cast(deallocate_handler) == reinterpret_cast(&std::free)), "Both shim pointers must refer to routines from the same package (either TBB or CRT)"); diff --git a/src/tbb/arena.cpp b/src/tbb/arena.cpp index 68a4c9c6f1..49e381224c 100644 --- a/src/tbb/arena.cpp +++ b/src/tbb/arena.cpp @@ -19,6 +19,7 @@ #include "arena.h" #include "itt_notify.h" #include "semaphore.h" +#include "waiters.h" #include "tbb/detail/_task.h" #include "tbb/tbb_allocator.h" @@ -103,40 +104,6 @@ std::size_t arena::occupy_free_slot(thread_data& tls) { return index; } -class outermost_worker_waiter { - arena& my_arena; - stealing_loop_backoff my_backoff; -public: - outermost_worker_waiter(arena& a) : my_arena( a ), my_backoff( int(a.my_num_slots) ) {} - - bool continue_execution(arena_slot& slot, d1::task*& t) const { - __TBB_ASSERT(t == nullptr, nullptr); - if (my_arena.is_recall_requested()) { - return false; - } - t = get_self_recall_task(slot); - return true; - } - - void pause() { - if (my_backoff.pause()) { - my_arena.is_out_of_work(); - } - } - - void reset_wait() { - my_backoff.reset_wait(); - } - - d1::wait_context* wait_ctx() { - return nullptr; - } - - static bool postpone_execution(d1::task&) { - return false; - } -}; - std::uintptr_t arena::calculate_stealing_threshold() { stack_anchor_type anchor; return r1::calculate_stealing_threshold(reinterpret_cast(&anchor), my_market->worker_stack_size()); @@ -251,7 +218,7 @@ void arena::free_arena () { __TBB_ASSERT( is_alive(my_guard), NULL ); __TBB_ASSERT( !my_references.load(std::memory_order_relaxed), "There are threads in the dying arena" ); __TBB_ASSERT( !my_num_workers_requested && !my_num_workers_allotted, "Dying arena requests workers" ); - __TBB_ASSERT( my_pool_state.load(std::memory_order_relaxed) == SNAPSHOT_EMPTY || !my_max_num_workers, + __TBB_ASSERT( my_pool_state.load(std::memory_order_relaxed) == SNAPSHOT_EMPTY || !my_max_num_workers, "Inconsistent state of a dying arena" ); #if __TBB_ENQUEUE_ENFORCED_CONCURRENCY __TBB_ASSERT( !my_global_concurrency_mode, NULL ); @@ -447,7 +414,7 @@ void task_arena_impl::initialize(d1::task_arena_base& ta) { governor::one_time_init(); if (ta.my_max_concurrency < 1) { #if __TBB_NUMA_SUPPORT - ta.my_max_concurrency = numa_default_concurrency(ta.numa_id()); + ta.my_max_concurrency = numa_default_concurrency(ta.my_numa_id); #else /*__TBB_NUMA_SUPPORT*/ ta.my_max_concurrency = (int)governor::default_num_threads(); #endif /*__TBB_NUMA_SUPPORT*/ @@ -462,7 +429,7 @@ void task_arena_impl::initialize(d1::task_arena_base& ta) { market::global_market( /*is_public=*/false); #if __TBB_NUMA_SUPPORT ta.my_arena->my_numa_binding_observer = construct_binding_observer( - static_cast(&ta), ta.numa_id(), ta.my_arena->my_num_slots); + static_cast(&ta), ta.my_numa_id, ta.my_arena->my_num_slots); #endif /*__TBB_NUMA_SUPPORT*/ } @@ -685,14 +652,15 @@ void task_arena_impl::execute(d1::task_arena_base& ta, d1::delegate_base& d) { context_guard.set_ctx(ta.my_arena->my_default_ctx); nested_arena_context scope(*td, *ta.my_arena, index1); #if _WIN64 - try_call([&] { + try { #endif d(); __TBB_ASSERT(same_arena || governor::is_thread_data_set(td), nullptr); #if _WIN64 - }).on_exception([&] { + } catch (...) 
{ context_guard.restore_default(); - }); + throw; + } #endif } @@ -701,8 +669,10 @@ void task_arena_impl::wait(d1::task_arena_base& ta) { thread_data* td = governor::get_thread_data(); __TBB_ASSERT_EX(td, "Scheduler is not initialized"); __TBB_ASSERT(td->my_arena != ta.my_arena || td->my_arena_index == 0, "internal_wait is not supported within a worker context" ); - while (ta.my_arena->num_workers_active() || ta.my_arena->my_pool_state.load(std::memory_order_acquire) != arena::SNAPSHOT_EMPTY) { - yield(); + if (ta.my_arena->my_max_num_workers != 0) { + while (ta.my_arena->num_workers_active() || ta.my_arena->my_pool_state.load(std::memory_order_acquire) != arena::SNAPSHOT_EMPTY) { + yield(); + } } } @@ -716,26 +686,32 @@ int task_arena_impl::max_concurrency(const d1::task_arena_base *ta) { if( a ) { // Get parameters from the arena __TBB_ASSERT( !ta || ta->my_max_concurrency==1, NULL ); return a->my_num_reserved_slots + a->my_max_num_workers; - } else { - __TBB_ASSERT( !ta || ta->my_max_concurrency==d1::task_arena_base::automatic, NULL ); - return int(governor::default_num_threads()); } + + if (ta && ta->my_max_concurrency == 1) { + return 1; + } + + __TBB_ASSERT(!ta || ta->my_max_concurrency==d1::task_arena_base::automatic, NULL ); + return int(governor::default_num_threads()); } void isolate_within_arena(d1::delegate_base& d, std::intptr_t isolation) { // TODO: Decide what to do if the scheduler is not initialized. Is there a use case for it? thread_data* tls = governor::get_thread_data(); assert_pointers_valid(tls, tls->my_task_dispatcher); - isolation_type previous_isolation = tls->my_task_dispatcher->m_execute_data_ext.isolation; + task_dispatcher* dispatcher = tls->my_task_dispatcher; + isolation_type previous_isolation = dispatcher->m_execute_data_ext.isolation; try_call([&] { // We temporarily change the isolation tag of the currently running task. It will be restored in the destructor of the guard. isolation_type current_isolation = isolation ? isolation : reinterpret_cast(&d); // Save the current isolation value and set new one - previous_isolation = tls->my_task_dispatcher->set_isolation(current_isolation); + previous_isolation = dispatcher->set_isolation(current_isolation); // Isolation within this callable d(); }).on_completion([&] { - tls->my_task_dispatcher->set_isolation(previous_isolation); + __TBB_ASSERT(governor::get_thread_data()->my_task_dispatcher == dispatcher, NULL); + dispatcher->set_isolation(previous_isolation); }); } diff --git a/src/tbb/arena.h b/src/tbb/arena.h index f1165036e7..b43556de3e 100644 --- a/src/tbb/arena.h +++ b/src/tbb/arena.h @@ -138,7 +138,7 @@ struct stack_anchor_type { Intrusive list node base class is used by market to form a list of arenas. **/ struct arena_base : padded { //! The number of workers that have been marked out by the resource manager to service the arena. - unsigned my_num_workers_allotted; // heavy use in stealing loop + std::atomic my_num_workers_allotted; // heavy use in stealing loop //! Reference counter for the arena. /** Worker and master references are counted separately: first several bits are for references @@ -171,12 +171,19 @@ struct arena_base : padded { //! The number of workers requested by the master thread owning the arena. unsigned my_max_num_workers; - //! The number of workers that are currently requested from the resource manager. + //! The total number of workers that are requested from the resource manager. + int my_total_num_workers_requested; + + //! 
The number of workers that are actually requested from the resource manager.
+    //! Possible values are in [0, my_max_num_workers]
     int my_num_workers_requested;
 
     //! The index in the array of per priority lists of arenas this object is in.
     /*const*/ unsigned my_priority_level;
 
+    //! Whether the arena is at the top priority level in the market.
+    std::atomic<bool> my_is_top_priority{false};
+
     //! Current task pool state and estimate of available tasks amount.
     /** The estimate is either 0 (SNAPSHOT_EMPTY) or infinity (SNAPSHOT_FULL).
         Special state is "busy" (any other unsigned value).
@@ -217,6 +224,9 @@
     std::atomic<bool> my_global_concurrency_mode;
 #endif /* __TBB_ENQUEUE_ENFORCED_CONCURRENCY */
 
+    //! Waiting object for external and coroutine waiters.
+    concurrent_monitor my_sleep_monitors;
+
     //! Waiting object for master threads that cannot join the arena.
     concurrent_monitor my_exit_monitors;
 
@@ -277,7 +287,7 @@ class arena: public padded<arena_base>
     //! Reference increment values for externals and workers
     static const unsigned ref_external = 1;
-    static const unsigned ref_worker = 1<
 
     //! Check if the recall is requested by the market.
     bool is_recall_requested() const {
-        return num_workers_active() > my_num_workers_allotted;
+        return num_workers_active() > my_num_workers_allotted.load(std::memory_order_relaxed);
     }
 
     //! If necessary, raise a flag that there is new job in arena.
@@ -394,6 +404,7 @@ inline void arena::on_thread_leaving ( ) {
     // state (including the fact if it is alive) under the lock.
     //
     std::uintptr_t aba_epoch = my_aba_epoch;
+    unsigned priority_level = my_priority_level;
     market* m = my_market;
     __TBB_ASSERT(my_references.load(std::memory_order_relaxed) >= ref_param, "broken arena reference counter");
 #if __TBB_ENQUEUE_ENFORCED_CONCURRENCY
@@ -415,15 +426,11 @@ inline void arena::on_thread_leaving ( ) {
     }
 #endif
     if ( (my_references -= ref_param ) == 0 )
-        m->try_destroy_arena( this, aba_epoch );
+        m->try_destroy_arena( this, aba_epoch, priority_level );
 }
 
 template<arena::new_work_type work_type>
 void arena::advertise_new_work() {
-    if (my_max_num_workers == 0 && my_num_reserved_slots > 1) {
-        // No workers are available. It is the worker-less arena.
-        return;
-    }
     if( work_type == work_enqueued ) {
 #if __TBB_ENQUEUE_ENFORCED_CONCURRENCY
         if ( my_market->my_num_workers_soft_limit.load(std::memory_order_acquire) == 0 &&
@@ -437,6 +444,9 @@ void arena::advertise_new_work() {
             my_pool_state.store(SNAPSHOT_FULL, std::memory_order_release);
             my_max_num_workers = 1;
             my_market->adjust_demand(*this, my_max_num_workers);
+
+            // Notify all sleeping threads that work has appeared in the arena.
+ my_sleep_monitors.notify_all(); } } } diff --git a/src/tbb/arena_slot.h b/src/tbb/arena_slot.h index cd42ddb4ec..ca74a92e0e 100644 --- a/src/tbb/arena_slot.h +++ b/src/tbb/arena_slot.h @@ -87,6 +87,7 @@ struct alignas(max_nfs_size) arena_slot_private_state { class arena_slot : private arena_slot_shared_state, private arena_slot_private_state { friend class arena; + friend class outermost_worker_waiter; friend class task_dispatcher; friend class thread_data; friend class nested_arena_context; @@ -350,6 +351,7 @@ class arena_slot : private arena_slot_shared_state, private arena_slot_private_s task_pool.store(victim_task_pool, std::memory_order_release); } +#if TBB_USE_ASSERT bool is_local_task_pool_quiescent() const { d1::task** tp = task_pool.load(std::memory_order_relaxed); return tp == EmptyTaskPool || tp == LockedTaskPool; @@ -364,6 +366,7 @@ class arena_slot : private arena_slot_shared_state, private arena_slot_private_s __TBB_ASSERT(is_local_task_pool_quiescent(), "Task pool is not quiescent"); return head.load(std::memory_order_relaxed) == 0 && tail.load(std::memory_order_relaxed) == 0; } +#endif // TBB_USE_ASSERT //! Leave the task pool /** Leaving task pool automatically releases the task pool if it is locked. **/ diff --git a/src/tbb/concurrent_monitor.cpp b/src/tbb/concurrent_monitor.cpp index b02fb05022..b3675c1846 100644 --- a/src/tbb/concurrent_monitor.cpp +++ b/src/tbb/concurrent_monitor.cpp @@ -82,6 +82,29 @@ void concurrent_monitor::notify_one_relaxed() { to_thread_context(n)->semaphore().V(); } +void concurrent_monitor::notify_all_relaxed() { + if( waitset_ec.empty() ) + return; + waitset_t temp; + const waitset_node_t* end; + { + tbb::spin_mutex::scoped_lock l( mutex_ec ); + epoch.store( epoch.load( std::memory_order_relaxed ) + 1, std::memory_order_relaxed ); + waitset_ec.flush_to( temp ); + end = temp.end(); + for( waitset_node_t* n=temp.front(); n!=end; n=n->next ) + to_thread_context(n)->in_waitset = false; + } + waitset_node_t* nxt; + for( waitset_node_t* n=temp.front(); n!=end; n=nxt ) { + nxt = n->next; + to_thread_context(n)->semaphore().V(); + } +#if TBB_USE_ASSERT + temp.clear(); +#endif +} + void concurrent_monitor::abort_all_relaxed() { if( waitset_ec.empty() ) return; diff --git a/src/tbb/concurrent_monitor.h b/src/tbb/concurrent_monitor.h index 363fb0716e..241ee06666 100644 --- a/src/tbb/concurrent_monitor.h +++ b/src/tbb/concurrent_monitor.h @@ -48,7 +48,6 @@ class circular_doubly_linked_list_with_sentinel : no_copy { inline bool empty() const {return size()==0;} inline node_t* front() const {return head.next;} inline node_t* last() const {return head.prev;} - inline node_t* begin() const {return front();} inline const node_t* end() const {return &head;} //! add to the back of the list @@ -157,6 +156,12 @@ class concurrent_monitor : no_copy { //! Notify one thread about the event. Relaxed version. void notify_one_relaxed(); + //! Notify all waiting threads of the event + void notify_all() {atomic_fence( std::memory_order_seq_cst ); notify_all_relaxed();} + + // ! Notify all waiting threads of the event; Relaxed version + void notify_all_relaxed(); + //! Notify waiting threads of the event that satisfies the given predicate template void notify( const P& predicate ) { atomic_fence( std::memory_order_seq_cst ); @@ -210,35 +215,6 @@ void concurrent_monitor::notify_relaxed( const P& predicate ) { #endif } -// Additional possible methods that are not required right now -// //! 
Notify all waiting threads of the event -// void notify_all() {atomic_fence( std::memory_order_seq_cst ); notify_all_relaxed();} - -// Additional possible methods that are not required right now -//! Notify all waiting threads of the event; Relaxed version -// void concurrent_monitor::notify_all_relaxed() { -// if( waitset_ec.empty() ) -// return; -// waitset_t temp; -// const waitset_node_t* end; -// { -// tbb::spin_mutex::scoped_lock l( mutex_ec ); -// epoch.store( epoch.load( std::memory_order_relaxed ) + 1, std::memory_order_relaxed ); -// waitset_ec.flush_to( temp ); -// end = temp.end(); -// for( waitset_node_t* n=temp.front(); n!=end; n=n->next ) -// to_thread_context(n)->in_waitset = false; -// } -// waitset_node_t* nxt; -// for( waitset_node_t* n=temp.front(); n!=end; n=nxt ) { -// nxt = n->next; -// to_thread_context(n)->semaphore().V(); -// } -// #if TBB_USE_ASSERT -// temp.clear(); -// #endif -// } - // Additional possible methods that are not required right now //! Wait for a condition to be satisfied with waiting-on context // template diff --git a/src/tbb/def/lin32-tbb.def b/src/tbb/def/lin32-tbb.def index efe60e0597..2218e8eb28 100644 --- a/src/tbb/def/lin32-tbb.def +++ b/src/tbb/def/lin32-tbb.def @@ -76,6 +76,7 @@ _ZN3tbb6detail2r19downgradeERNS0_2d112rtm_rw_mutex11scoped_lockE; _ZN3tbb6detail2r17suspendEPFvPvPNS1_18suspend_point_typeEES2_; _ZN3tbb6detail2r16resumeEPNS1_18suspend_point_typeE; _ZN3tbb6detail2r121current_suspend_pointEv; +_ZN3tbb6detail2r114notify_waitersERNS0_2d112wait_contextE; /* Task dispatcher (task_dispatcher.cpp) */ _ZN3tbb6detail2r114execution_slotEPKNS0_2d114execution_dataE; diff --git a/src/tbb/def/lin64-tbb.def b/src/tbb/def/lin64-tbb.def index a2fdf49d51..f7685094d9 100644 --- a/src/tbb/def/lin64-tbb.def +++ b/src/tbb/def/lin64-tbb.def @@ -76,6 +76,7 @@ _ZN3tbb6detail2r19downgradeERNS0_2d112rtm_rw_mutex11scoped_lockE; _ZN3tbb6detail2r17suspendEPFvPvPNS1_18suspend_point_typeEES2_; _ZN3tbb6detail2r16resumeEPNS1_18suspend_point_typeE; _ZN3tbb6detail2r121current_suspend_pointEv; +_ZN3tbb6detail2r114notify_waitersERNS0_2d112wait_contextE; /* Task dispatcher (task_dispatcher.cpp) */ _ZN3tbb6detail2r114execution_slotEPKNS0_2d114execution_dataE; diff --git a/src/tbb/def/mac64-tbb.def b/src/tbb/def/mac64-tbb.def index c907873684..78d1d64942 100644 --- a/src/tbb/def/mac64-tbb.def +++ b/src/tbb/def/mac64-tbb.def @@ -78,6 +78,7 @@ __ZN3tbb6detail2r19downgradeERNS0_2d112rtm_rw_mutex11scoped_lockE __ZN3tbb6detail2r17suspendEPFvPvPNS1_18suspend_point_typeEES2_ __ZN3tbb6detail2r16resumeEPNS1_18suspend_point_typeE __ZN3tbb6detail2r121current_suspend_pointEv +__ZN3tbb6detail2r114notify_waitersERNS0_2d112wait_contextE # Task dispatcher (task_dispatcher.cpp) __ZN3tbb6detail2r114execution_slotEPKNS0_2d114execution_dataE diff --git a/src/tbb/def/win32-tbb.def b/src/tbb/def/win32-tbb.def index 6d15ee64ba..b2c93a486b 100644 --- a/src/tbb/def/win32-tbb.def +++ b/src/tbb/def/win32-tbb.def @@ -70,6 +70,7 @@ EXPORTS ?current_suspend_point@r1@detail@tbb@@YAPAUsuspend_point_type@123@XZ ?resume@r1@detail@tbb@@YAXPAUsuspend_point_type@123@@Z ?suspend@r1@detail@tbb@@YAXP6AXPAXPAUsuspend_point_type@123@@Z0@Z +?notify_waiters@r1@detail@tbb@@YAXAAVwait_context@d1@23@@Z ; Task dispatcher (task_dispatcher.cpp) ?spawn@r1@detail@tbb@@YAXAAVtask@d1@23@AAVtask_group_context@523@G@Z diff --git a/src/tbb/def/win64-tbb.def b/src/tbb/def/win64-tbb.def index 7c248f93ae..89f27b98c9 100644 --- a/src/tbb/def/win64-tbb.def +++ b/src/tbb/def/win64-tbb.def @@ -70,6 +70,7 @@ EXPORTS 
?suspend@r1@detail@tbb@@YAXP6AXPEAXPEAUsuspend_point_type@123@@Z0@Z ?resume@r1@detail@tbb@@YAXPEAUsuspend_point_type@123@@Z ?current_suspend_point@r1@detail@tbb@@YAPEAUsuspend_point_type@123@XZ +?notify_waiters@r1@detail@tbb@@YAXAEAVwait_context@d1@23@@Z ; Task dispatcher (task_dispatcher.cpp) ?spawn@r1@detail@tbb@@YAXAEAVtask@d1@23@AEAVtask_group_context@523@@Z diff --git a/src/tbb/global_control.cpp b/src/tbb/global_control.cpp index 7a94e971f4..b642dce370 100644 --- a/src/tbb/global_control.cpp +++ b/src/tbb/global_control.cpp @@ -235,7 +235,7 @@ struct global_control_impl { erase_if_present(c, gc); return c->my_list.empty(); } - +#if TBB_USE_ASSERT static bool is_present(d1::global_control& gc) { __TBB_ASSERT_RELEASE(gc.my_param < global_control::parameter_max, NULL); control_storage* const c = controls[gc.my_param]; @@ -247,6 +247,7 @@ struct global_control_impl { } return false; } +#endif // TBB_USE_ASSERT }; void __TBB_EXPORTED_FUNC create(d1::global_control& gc) { @@ -259,10 +260,11 @@ void __TBB_EXPORTED_FUNC destroy(d1::global_control& gc) { bool remove_and_check_if_empty(d1::global_control& gc) { return global_control_impl::remove_and_check_if_empty(gc); } +#if TBB_USE_ASSERT bool is_present(d1::global_control& gc) { return global_control_impl::is_present(gc); } - +#endif // TBB_USE_ASSERT std::size_t __TBB_EXPORTED_FUNC global_control_active_value(int param) { __TBB_ASSERT_RELEASE(param < global_control::parameter_max, NULL); return controls[param]->active_value(); diff --git a/src/tbb/governor.cpp b/src/tbb/governor.cpp index 54dfc071a1..df5708dc90 100644 --- a/src/tbb/governor.cpp +++ b/src/tbb/governor.cpp @@ -58,7 +58,7 @@ void governor::acquire_resources () { #endif if( status ) handle_perror(status, "TBB failed to initialize task scheduler TLS\n"); - is_speculation_enabled = cpu_has_speculation(); + detect_cpu_features(cpu_features); is_rethrow_broken = gcc_rethrow_exception_broken(); } @@ -241,7 +241,7 @@ bool finalize_impl(d1::task_scheduler_handle& handle) { if (td) { task_dispatcher* task_disp = td->my_task_dispatcher; __TBB_ASSERT(task_disp, nullptr); - if (task_disp->m_properties.outermost && !td->my_is_worker) { // is not inside a tbb parallel region + if (task_disp->m_properties.outermost && !td->my_is_worker) { // is not inside a parallel region governor::auto_terminate(td); } } diff --git a/src/tbb/governor.h b/src/tbb/governor.h index dc58630633..f0443244c1 100644 --- a/src/tbb/governor.h +++ b/src/tbb/governor.h @@ -65,7 +65,7 @@ class governor { static bool UsePrivateRML; // Flags for runtime-specific conditions - static bool is_speculation_enabled; + static cpu_features_type cpu_features; static bool is_rethrow_broken; //! Create key for thread-local storage and initialize RML. 
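governor now probes CPU features once during acquire_resources and serves later queries from a cached struct, so speculation_enabled and wait_package_enabled become trivial reads. A rough sketch of the pattern; the CPUID probe is stubbed out here, while the real bit tests appear in misc.cpp further below:

    // Sketch: probe once at startup, cache results, expose cheap predicates.
    #include <cstdio>

    struct cpu_features_type {
        bool rtm_enabled{false};
        bool waitpkg_enabled{false};
    };

    static cpu_features_type cpu_features;

    void detect_cpu_features(cpu_features_type& f) {
        // Placeholder: the real code executes CPUID leaf 7 and tests
        // EBX bit 11 (RTM) and ECX bit 5 (WAITPKG).
        f.rtm_enabled = false;
        f.waitpkg_enabled = false;
    }

    bool speculation_enabled()  { return cpu_features.rtm_enabled; }
    bool wait_package_enabled() { return cpu_features.waitpkg_enabled; }

    int main() {
        detect_cpu_features(cpu_features);  // done once, as in governor::acquire_resources
        std::printf("rtm=%d waitpkg=%d\n", speculation_enabled(), wait_package_enabled());
    }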
@@ -136,7 +136,9 @@ class governor { static bool does_client_join_workers (const rml::tbb_client &client); - static bool speculation_enabled() { return is_speculation_enabled; } + static bool speculation_enabled() { return cpu_features.rtm_enabled; } + + static bool wait_package_enabled() { return cpu_features.waitpkg_enabled; } static bool rethrow_exception_broken() { return is_rethrow_broken; } diff --git a/src/tbb/intrusive_list.h b/src/tbb/intrusive_list.h index ccaeec8840..1aebff8797 100644 --- a/src/tbb/intrusive_list.h +++ b/src/tbb/intrusive_list.h @@ -65,11 +65,6 @@ class intrusive_list_base { iterator_impl( pointer_type pos ) : my_pos(pos) {} - iterator_impl& operator=( const T& val ) { - my_pos = &node(val); - return *this; - } - iterator_impl& operator++() { my_pos = my_pos->my_next_node; return *this; diff --git a/src/tbb/main.cpp b/src/tbb/main.cpp index e524475de6..1968b5c216 100644 --- a/src/tbb/main.cpp +++ b/src/tbb/main.cpp @@ -39,8 +39,8 @@ unsigned governor::DefaultNumberOfThreads; size_t governor::DefaultPageSize; rml::tbb_factory governor::theRMLServerFactory; bool governor::UsePrivateRML; -bool governor::is_speculation_enabled; bool governor::is_rethrow_broken; +cpu_features_type governor::cpu_features; //------------------------------------------------------------------------ // market data diff --git a/src/tbb/market.cpp b/src/tbb/market.cpp index 8f44c29343..78757d2029 100644 --- a/src/tbb/market.cpp +++ b/src/tbb/market.cpp @@ -313,7 +313,7 @@ void market::detach_arena ( arena& a ) { ++my_arenas_aba_epoch; } -void market::try_destroy_arena ( arena* a, uintptr_t aba_epoch ) { +void market::try_destroy_arena ( arena* a, uintptr_t aba_epoch, unsigned priority_level ) { bool locked = true; __TBB_ASSERT( a, NULL ); // we hold reference to the market, so it cannot be destroyed at any moment here @@ -321,14 +321,14 @@ void market::try_destroy_arena ( arena* a, uintptr_t aba_epoch ) { __TBB_ASSERT( my_ref_count!=0, NULL ); my_arenas_list_mutex.lock(); assert_market_valid(); - arena_list_type::iterator it = my_arenas[a->my_priority_level].begin(); - for ( ; it != my_arenas[a->my_priority_level].end(); ++it ) { + arena_list_type::iterator it = my_arenas[priority_level].begin(); + for ( ; it != my_arenas[priority_level].end(); ++it ) { if ( a == &*it ) { if ( it->my_aba_epoch == aba_epoch ) { // Arena is alive if ( !a->my_num_workers_requested && !a->my_references.load(std::memory_order_relaxed) ) { __TBB_ASSERT( - !a->my_num_workers_allotted && + !a->my_num_workers_allotted.load(std::memory_order_relaxed) && (a->my_pool_state == arena::SNAPSHOT_EMPTY || !a->my_max_num_workers), "Inconsistent arena state" ); @@ -364,7 +364,7 @@ arena* market::arena_in_need ( arena_list_type* arenas, arena* hint ) { } while ( arenas[curr_priority_level].empty() ); it = arenas[curr_priority_level].begin(); } - if( a.num_workers_active() < a.my_num_workers_allotted ) { + if( a.num_workers_active() < a.my_num_workers_allotted.load(std::memory_order_relaxed) ) { a.my_references += arena::ref_worker; return &a; } @@ -389,14 +389,21 @@ int market::update_allotment ( arena_list_type* arenas, int workers_demand, int int unassigned_workers = max_workers; int assigned = 0; int carry = 0; + unsigned max_priority_level = num_priority_levels; for (unsigned list_idx = 0; list_idx < num_priority_levels; ++list_idx ) { int assigned_per_priority = 0; for (arena_list_type::iterator it = arenas[list_idx].begin(); it != arenas[list_idx].end(); ++it) { arena& a = *it; - if (a.my_num_workers_requested <= 0) { - 
__TBB_ASSERT(!a.my_num_workers_allotted, nullptr); + __TBB_ASSERT(a.my_num_workers_requested >= 0 && a.my_num_workers_requested <= int(a.my_max_num_workers), nullptr); + if (a.my_num_workers_requested == 0) { + __TBB_ASSERT(!a.my_num_workers_allotted.load(std::memory_order_relaxed), nullptr); + continue; + } + + if (max_priority_level == num_priority_levels) { + max_priority_level = list_idx; + } + + int allotted = 0; #if __TBB_ENQUEUE_ENFORCED_CONCURRENCY if (my_num_workers_soft_limit.load(std::memory_order_relaxed) == 0) { @@ -412,7 +419,8 @@ int market::update_allotment ( arena_list_type* arenas, int workers_demand, int // a.my_num_workers_requested may temporarily exceed a.my_max_num_workers allotted = min(allotted, (int)a.my_max_num_workers); } - a.my_num_workers_allotted = allotted; + a.my_num_workers_allotted.store(allotted, std::memory_order_relaxed); + a.my_is_top_priority.store(list_idx == max_priority_level, std::memory_order_relaxed); assigned += allotted; assigned_per_priority += allotted; } @@ -505,19 +513,26 @@ void market::adjust_demand ( arena& a, int delta ) { if ( !delta ) return; my_arenas_list_mutex.lock(); - int prev_req = a.my_num_workers_requested; - a.my_num_workers_requested += delta; - if ( a.my_num_workers_requested <= 0 ) { - a.my_num_workers_allotted = 0; - if ( prev_req <= 0 ) { - my_arenas_list_mutex.unlock(); - return; - } - delta = -prev_req; + a.my_total_num_workers_requested += delta; + int target_workers = 0; + // Clamp target_workers to the interval [0, a.my_max_num_workers] + if (a.my_total_num_workers_requested > 0) { + target_workers = a.my_total_num_workers_requested < int(a.my_max_num_workers) ? + a.my_total_num_workers_requested : a.my_max_num_workers; } - else if ( prev_req < 0 ) { - delta = a.my_num_workers_requested; + + delta = target_workers - a.my_num_workers_requested; + + if (delta == 0) { + my_arenas_list_mutex.unlock(); + return; + } + + a.my_num_workers_requested += delta; + if (a.my_num_workers_requested == 0) { + a.my_num_workers_allotted.store(0, std::memory_order_relaxed); } + my_total_demand += delta; my_priority_level_demand[a.my_priority_level] += delta; unsigned effective_soft_limit = my_num_workers_soft_limit.load(std::memory_order_relaxed); @@ -540,9 +555,14 @@ void market::adjust_demand ( arena& a, int delta ) { my_num_workers_requested += delta; __TBB_ASSERT( my_num_workers_requested <= (int)effective_soft_limit, NULL ); + int target_epoch = my_adjust_demand_target_epoch++; + my_arenas_list_mutex.unlock(); + + spin_wait_until_eq(my_adjust_demand_current_epoch, target_epoch); // Must be called outside of any locks my_server->adjust_job_count_estimate( delta ); + my_adjust_demand_current_epoch.store(target_epoch + 1, std::memory_order_release); } void market::process( job& j ) { diff --git a/src/tbb/market.h b/src/tbb/market.h index b4d9f1152c..3572641659 100644 --- a/src/tbb/market.h +++ b/src/tbb/market.h @@ -100,6 +100,12 @@ class market : no_copy, rml::tbb_client { //! Number of workers currently requested from RML int my_num_workers_requested; + //! The target serialization epoch for callers of adjust_job_count_estimate + int my_adjust_demand_target_epoch; + + //! The current serialization epoch for callers of adjust_job_count_estimate + std::atomic<int> my_adjust_demand_current_epoch; + //! First unused index of worker /** Used to assign indices to the new workers coming from RML, and busy part of my_workers array. **/ @@ -215,7 +221,7 @@ class market : no_copy, rml::tbb_client { unsigned arena_index, std::size_t stack_size ); //!
Removes the arena from the market's list - void try_destroy_arena ( arena*, uintptr_t aba_epoch ); + void try_destroy_arena ( arena*, uintptr_t aba_epoch, unsigned priority_level ); //! Removes the arena from the market's list void detach_arena ( arena& ); diff --git a/src/tbb/misc.cpp b/src/tbb/misc.cpp index 34a150d7bf..261ae147ec 100644 --- a/src/tbb/misc.cpp +++ b/src/tbb/misc.cpp @@ -74,50 +74,57 @@ void PrintExtraVersionInfo( const char* category, const char* format, ... ) { } } -void PrintRMLVersionInfo( void* arg, const char* server_info ) { - PrintExtraVersionInfo( server_info, (const char *)arg ); } - //! check for transaction support. #if _MSC_VER #include <intrin.h> // for __cpuid #endif -bool cpu_has_speculation() { -#if (__TBB_x86_32 || __TBB_x86_64) -#if (__INTEL_COMPILER || __GNUC__ || _MSC_VER || __SUNPRO_CC) - bool result = false; - const int rtm_ebx_mask = 1<<11; + +#if __TBB_x86_32 || __TBB_x86_64 +void check_cpuid(int leaf, int sub_leaf, int registers[4]) { #if _MSC_VER - int info[4] = {0,0,0,0}; - const int reg_ebx = 1; - __cpuidex(info, 7, 0); - result = (info[reg_ebx] & rtm_ebx_mask)!=0; -#elif __GNUC__ || __SUNPRO_CC - int32_t reg_ebx = 0; - int32_t reg_eax = 7; - int32_t reg_ecx = 0; - __asm__ __volatile__ ( "movl %%ebx, %%esi\n" - "cpuid\n" - "movl %%ebx, %0\n" - "movl %%esi, %%ebx\n" - : "=a"(reg_ebx) : "0" (reg_eax), "c" (reg_ecx) : "esi", -#if __TBB_x86_64 - "ebx", + __cpuidex(registers, leaf, sub_leaf); +#else + int reg_eax = 0; + int reg_ebx = 0; + int reg_ecx = 0; + int reg_edx = 0; +#if __TBB_x86_32 && __PIC__ + // On 32-bit systems with position-independent code, GCC cannot freely clobber the EBX + // register (it holds the GOT pointer), so we back it up and restore it around CPUID. + __asm__("mov %%ebx, %%esi\n\t" + "cpuid\n\t" + "xchg %%ebx, %%esi" + : "=a"(reg_eax), "=S"(reg_ebx), "=c"(reg_ecx), "=d"(reg_edx) + : "0"(leaf), "2"(sub_leaf) // pass leaf in EAX and sub_leaf in ECX + ); +#else + __asm__("cpuid" + : "=a"(reg_eax), "=b"(reg_ebx), "=c"(reg_ecx), "=d"(reg_edx) + : "0"(leaf), "2"(sub_leaf) // pass leaf in EAX and sub_leaf in ECX + ); #endif - "edx" - ); - result = (reg_ebx & rtm_ebx_mask)!=0 ; + registers[0] = reg_eax; + registers[1] = reg_ebx; + registers[2] = reg_ecx; + registers[3] = reg_edx; #endif - return result; -#else - #error Speculation detection not enabled for compiler -#endif /* __INTEL_COMPILER || __GNUC__ || _MSC_VER */ -#else /* (__TBB_x86_32 || __TBB_x86_64) */ - return false; +} +#endif + +void detect_cpu_features(cpu_features_type& cpu_features) { + suppress_unused_warning(cpu_features); +#if __TBB_x86_32 || __TBB_x86_64 + const int rtm_ebx_mask = 1 << 11; + const int waitpkg_ecx_mask = 1 << 5; + int registers[4] = {0}; + + // Check RTM and WAITPKG + check_cpuid(7, 0, registers); + cpu_features.rtm_enabled = (registers[1] & rtm_ebx_mask) != 0; + cpu_features.waitpkg_enabled = (registers[2] & waitpkg_ecx_mask) != 0; #endif /* (__TBB_x86_32 || __TBB_x86_64) */ } } // namespace r1 } // namespace detail } // namespace tbb - diff --git a/src/tbb/misc.h b/src/tbb/misc.h index 9773e9ba17..ba31e50a66 100644 --- a/src/tbb/misc.h +++ b/src/tbb/misc.h @@ -211,7 +211,12 @@ T1 atomic_update(std::atomic<T1>& dst, T1 newValue, Pred compare) { inline void destroy_process_mask(){} #endif /* __TBB_USE_OS_AFFINITY_SYSCALL */ -bool cpu_has_speculation(); +struct cpu_features_type { + bool rtm_enabled{false}; + bool waitpkg_enabled{false}; +}; + +void detect_cpu_features(cpu_features_type& cpu_features); #if __TBB_NUMA_SUPPORT class binding_handler; @@ -273,6 +278,7 @@ static inline void abort_transaction() 
{ #endif } +#if TBB_USE_ASSERT static inline unsigned char is_in_transaction() { #if __TBB_TSX_INTRINSICS_PRESENT return _xtest(); @@ -280,6 +286,7 @@ static inline unsigned char is_in_transaction() { return 0; #endif } +#endif // TBB_USE_ASSERT } // namespace r1 } // namespace detail diff --git a/src/tbb/observer_proxy.cpp b/src/tbb/observer_proxy.cpp index 66f4acb63b..6cae72438f 100644 --- a/src/tbb/observer_proxy.cpp +++ b/src/tbb/observer_proxy.cpp @@ -40,7 +40,7 @@ observer_proxy::observer_proxy( d1::task_scheduler_observer& tso ) #endif /* TBB_USE_ASSERT */ } -observer_proxy::~observer_proxy () { +observer_proxy::~observer_proxy() { __TBB_ASSERT( !my_ref_count, "Attempt to destroy proxy still in use" ); poison_value(my_ref_count); poison_pointer(my_prev); @@ -50,7 +50,7 @@ observer_proxy::~observer_proxy () { #endif /* TBB_USE_ASSERT */ } -void observer_list::clear () { +void observer_list::clear() { // Though the method will work fine for the empty list, we require the caller // to check for the list emptiness before invoking it to avoid extra overhead. __TBB_ASSERT( !empty(), NULL ); @@ -72,15 +72,25 @@ void observer_list::clear () { __TBB_ASSERT(is_alive(p->my_ref_count), "Observer's proxy died prematurely"); __TBB_ASSERT(p->my_ref_count.load(std::memory_order_relaxed) == 1, "Reference for observer is missing"); poison_pointer(p->my_observer); - poison_value(p->my_ref_count); remove(p); + --p->my_ref_count; delete p; } - __TBB_ASSERT(my_head == nullptr && my_tail == nullptr, nullptr); } + + // If observe(false) is called concurrently with the destruction of the arena, + // we need to wait until all proxies are removed. + for (atomic_backoff backoff; ; backoff.pause()) { + scoped_lock lock(mutex(), /*is_writer=*/false); + if (my_head == nullptr) { + break; + } + } + + __TBB_ASSERT(my_head == nullptr && my_tail == nullptr, nullptr); } -void observer_list::insert ( observer_proxy* p ) { +void observer_list::insert( observer_proxy* p ) { scoped_lock lock(mutex(), /*is_writer=*/true); if (my_head) { p->my_prev = my_tail; @@ -217,6 +227,9 @@ void observer_list::do_notify_exit_observers(observer_proxy* last, bool worker) remove_ref_fast(p); if (p) { lock.release(); + if (p != prev && prev) { + remove_ref(prev); + } remove_ref(p); } return; diff --git a/src/tbb/observer_proxy.h b/src/tbb/observer_proxy.h index ca0e60e39a..33ec92b7e6 100644 --- a/src/tbb/observer_proxy.h +++ b/src/tbb/observer_proxy.h @@ -35,10 +35,10 @@ class observer_list { typedef aligned_space<spin_rw_mutex> my_mutex_type; //! Pointer to the head of this list. - observer_proxy* my_head; + observer_proxy* my_head{nullptr}; //! Pointer to the tail of this list. - observer_proxy* my_tail; + observer_proxy* my_tail{nullptr}; //! Mutex protecting this list. my_mutex_type my_mutex; @@ -57,7 +57,7 @@ class observer_list { void do_notify_exit_observers( observer_proxy* last, bool worker ); public: - observer_list () : my_head(NULL), my_tail(NULL) {} + observer_list () = default; //! Removes and destroys all observer proxies from the list. /** Cannot be used concurrently with other methods. **/ @@ -79,11 +79,11 @@ class observer_list { //! Accessor to the reader-writer mutex associated with the list. spin_rw_mutex& mutex () { return my_mutex.begin()[0]; } - bool empty () const { return my_head == NULL; } + bool empty () const { return my_head == nullptr; } //! Call entry notifications on observers added after last was notified. /** Updates last to become the last notified observer proxy (in the global list) - or leaves it to be NULL. 
The proxy has its refcount incremented. **/ + or leaves it to be nullptr. The proxy has its refcount incremented. **/ inline void notify_entry_observers( observer_proxy*& last, bool worker ); //! Call exit notifications on last and observers added before it. @@ -120,7 +120,7 @@ class observer_proxy { ~observer_proxy(); }; // class observer_proxy -inline void observer_list::remove_ref_fast( observer_proxy*& p ) { +void observer_list::remove_ref_fast( observer_proxy*& p ) { if( p->my_observer ) { // Can decrement refcount quickly, as it cannot drop to zero while under the lock. std::uintptr_t r = --p->my_ref_count; @@ -131,13 +131,13 @@ inline void observer_list::remove_ref_fast( observer_proxy*& p ) { } } -inline void observer_list::notify_entry_observers(observer_proxy*& last, bool worker) { +void observer_list::notify_entry_observers(observer_proxy*& last, bool worker) { if (last == my_tail) return; do_notify_entry_observers(last, worker); } -inline void observer_list::notify_exit_observers( observer_proxy*& last, bool worker ) { +void observer_list::notify_exit_observers( observer_proxy*& last, bool worker ) { if (last == nullptr) { return; } diff --git a/src/tbb/queuing_rw_mutex.cpp b/src/tbb/queuing_rw_mutex.cpp index 5d34909514..24c2d11516 100644 --- a/src/tbb/queuing_rw_mutex.cpp +++ b/src/tbb/queuing_rw_mutex.cpp @@ -532,18 +532,23 @@ struct queuing_rw_mutex_impl { void __TBB_EXPORTED_FUNC acquire(d1::queuing_rw_mutex& m, d1::queuing_rw_mutex::scoped_lock& s, bool write) { queuing_rw_mutex_impl::acquire(m, s, write); } + bool __TBB_EXPORTED_FUNC try_acquire(d1::queuing_rw_mutex& m, d1::queuing_rw_mutex::scoped_lock& s, bool write) { return queuing_rw_mutex_impl::try_acquire(m, s, write); } + void __TBB_EXPORTED_FUNC release(d1::queuing_rw_mutex::scoped_lock& s) { queuing_rw_mutex_impl::release(s); } + bool __TBB_EXPORTED_FUNC upgrade_to_writer(d1::queuing_rw_mutex::scoped_lock& s) { return queuing_rw_mutex_impl::upgrade_to_writer(s); } + bool __TBB_EXPORTED_FUNC downgrade_to_reader(d1::queuing_rw_mutex::scoped_lock& s) { return queuing_rw_mutex_impl::downgrade_to_reader(s); } + void __TBB_EXPORTED_FUNC construct(d1::queuing_rw_mutex& m) { queuing_rw_mutex_impl::construct(m); } diff --git a/src/tbb/rml_tbb.cpp b/src/tbb/rml_tbb.cpp index bd9a051646..48edd7f6ca 100644 --- a/src/tbb/rml_tbb.cpp +++ b/src/tbb/rml_tbb.cpp @@ -106,12 +106,6 @@ ::rml::factory::status_type FACTORY::make_server( SERVER*& s, CLIENT& c) { return (*my_make_server_routine)(*this,s,c); } -void FACTORY::call_with_server_info( ::rml::server_info_callback_t cb, void* arg ) const { - // Failure of following assertion means that factory was not successfully opened. - __TBB_ASSERT_EX( my_call_with_server_info_routine, NULL ); - (*my_call_with_server_info_routine)( cb, arg ); -} - } // namespace rml } // namespace r1 } // namespace detail diff --git a/src/tbb/rml_tbb.h b/src/tbb/rml_tbb.h index f873085c9d..5dfce10d4e 100644 --- a/src/tbb/rml_tbb.h +++ b/src/tbb/rml_tbb.h @@ -84,9 +84,6 @@ class tbb_factory: public ::rml::factory { //! Close factory void close(); - - //! Call the callback with the server build info - void call_with_server_info( ::rml::server_info_callback_t cb, void* arg ) const; }; } // namespace rml diff --git a/src/tbb/rtm_rw_mutex.cpp b/src/tbb/rtm_rw_mutex.cpp index 3563840f31..4450d3cfb1 100644 --- a/src/tbb/rtm_rw_mutex.cpp +++ b/src/tbb/rtm_rw_mutex.cpp @@ -60,7 +60,7 @@ struct rtm_rw_mutex_impl { } s.m_transaction_state = d1::rtm_rw_mutex::rtm_type::rtm_not_in_mutex; } - + //! 
Acquire write lock on the given mutex. static void acquire_writer(d1::rtm_rw_mutex& m, d1::rtm_rw_mutex::scoped_lock& s, bool only_speculate) { __TBB_ASSERT(s.m_transaction_state == d1::rtm_rw_mutex::rtm_type::rtm_not_in_mutex, "scoped_lock already in transaction"); @@ -89,7 +89,7 @@ struct rtm_rw_mutex_impl { ++num_retries; } while((abort_code & speculation_retry) != 0 && (num_retries < retry_threshold_write)); } - + if(only_speculate) return; s.m_mutex = &m; // should apply a real try_lock... s.m_mutex->lock(); // kill transactional writers @@ -98,7 +98,7 @@ struct rtm_rw_mutex_impl { s.m_transaction_state = d1::rtm_rw_mutex::rtm_type::rtm_real_writer; return; } - + //! Acquire read lock on given mutex. // only_speculate : true if we are doing a try_acquire. If true and we fail to speculate, don't // really acquire the lock, return and do a try_acquire on the contained spin_rw_mutex. If @@ -129,8 +129,8 @@ struct rtm_rw_mutex_impl { } // fallback path // retry only if there is any hope of getting into a transaction soon - // Retry in the following cases (from Section 8.3.5 of Intel(R) - // Architecture Instruction Set Extensions Programming Reference): + // Retry in the following cases (from Section 8.3.5 of + // Intel(R) Architecture Instruction Set Extensions Programming Reference): // 1. abort caused by XABORT instruction (bit 0 of EAX register is set) // 2. the transaction may succeed on a retry (bit 1 of EAX register is set) // 3. if another logical processor conflicted with a memory address @@ -139,13 +139,13 @@ struct rtm_rw_mutex_impl { ++num_retries; } while((abort_code & speculation_retry) != 0 && (num_retries < retry_threshold_read)); } - + if(only_speculate) return; s.m_mutex = &m; s.m_mutex->lock_shared(); s.m_transaction_state = d1::rtm_rw_mutex::rtm_type::rtm_real_reader; } - + //! Upgrade reader to become a writer. /** Returns whether the upgrade happened without releasing and re-acquiring the lock */ static bool upgrade(d1::rtm_rw_mutex::scoped_lock& s) { @@ -175,7 +175,7 @@ struct rtm_rw_mutex_impl { return false; } } - + //! Downgrade writer to a reader. static bool downgrade(d1::rtm_rw_mutex::scoped_lock& s) { switch (s.m_transaction_state) { @@ -193,7 +193,7 @@ struct rtm_rw_mutex_impl { return false; } } - + //! Try to acquire write lock on the given mutex. // There may be reader(s) which acquired the spin_rw_mutex, as well as possibly // transactional reader(s). If this is the case, the acquire will fail, and assigning @@ -216,7 +216,7 @@ struct rtm_rw_mutex_impl { } return false; } - + //! Try to acquire read lock on the given mutex. static bool try_acquire_reader(d1::rtm_rw_mutex& m, d1::rtm_rw_mutex::scoped_lock& s) { // speculatively acquire the lock. If this fails, do try_lock_shared on the spin_rw_mutex. diff --git a/src/tbb/scheduler_common.h b/src/tbb/scheduler_common.h index 0f1eedd779..5581569607 100644 --- a/src/tbb/scheduler_common.h +++ b/src/tbb/scheduler_common.h @@ -189,7 +189,8 @@ inline std::uint64_t machine_time_stamp() { return (std::uint64_t(hi) << 32) | lo; #endif } -inline void prolonged_pause() { + +inline void prolonged_pause_impl() { // Assumption based on practice: 1000-2000 ticks seems to be a suitable invariant for the // majority of platforms. Currently, skip platforms that define __TBB_STEALING_PAUSE // because these platforms require very careful tuning. 
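For quick reference, the hunks around here split the old prolonged_pause() into prolonged_pause_impl() (the calibrated pause loop) plus a thin dispatcher that prefers the new WAITPKG path when governor::wait_package_enabled() reports hardware support. A condensed sketch of the resulting control flow, with the patch's if/else restructured as an early return for readability (an illustration, not the patch text):

inline void prolonged_pause() {
#if __TBB_WAITPKG_INTRINSICS_PRESENT && (_WIN32 || _WIN64 || __linux__) && (__TBB_x86_32 || __TBB_x86_64)
    if (governor::wait_package_enabled()) {
        // Enter a low-power optimized state until the TSC passes "now + 700" ticks.
        _tpause(0, machine_time_stamp() + 700);
        return;
    }
#endif
    prolonged_pause_impl(); // classic calibrated pause loop
}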
@@ -206,7 +207,7 @@ inline void prolonged_pause() { } while (prev < finish); } #else -inline void prolonged_pause() { +inline void prolonged_pause_impl() { #ifdef __TBB_ipf static const long PauseTime = 1500; #else @@ -215,7 +216,22 @@ inline void prolonged_pause() { // TODO IDEA: Update PauseTime adaptively? machine_pause(PauseTime); } -#endif // (_WIN32 || _WIN64 || __linux__) && (__TBB_x86_32 || __TBB_x86_64) +#endif + +inline void prolonged_pause() { +#if __TBB_WAITPKG_INTRINSICS_PRESENT && (_WIN32 || _WIN64 || __linux__) && (__TBB_x86_32 || __TBB_x86_64) + if (governor::wait_package_enabled()) { + std::uint64_t time_stamp = machine_time_stamp(); + // The _tpause function directs the processor to enter an implementation-dependent optimized state + // until the Time Stamp Counter reaches or exceeds the value specified in the second parameter. + // The constant 700 is the number of ticks to wait. + // The first parameter selects between a lower-power (0) and a faster-wakeup (1) optimized state. + _tpause(0, time_stamp + 700); + } + else +#endif + prolonged_pause_impl(); +} class stealing_loop_backoff { const int my_pause_threshold; diff --git a/src/tbb/task.cpp b/src/tbb/task.cpp index 91bbc68a18..f2ce477cd8 100644 --- a/src/tbb/task.cpp +++ b/src/tbb/task.cpp @@ -20,6 +20,7 @@ #include "arena.h" #include "thread_data.h" #include "task_dispatcher.h" +#include "waiters.h" #include "itt_notify.h" #include "tbb/detail/_task.h" @@ -30,6 +31,72 @@ namespace tbb { namespace detail { + +namespace d1 { + +bool wait_context::is_locked() { + return m_ref_count.load(std::memory_order_relaxed) & lock_flag; +} + +void wait_context::lock() { + atomic_backoff backoff; + + auto try_lock = [&] { return !(m_ref_count.fetch_or(lock_flag) & lock_flag); }; + + // While is_locked() returns true, try_lock is not invoked + while (is_locked() || !try_lock()) { + backoff.pause(); + } +} + +void wait_context::unlock() { + __TBB_ASSERT(is_locked(), NULL); + m_ref_count.fetch_and(~lock_flag); +} + +bool wait_context::publish_wait_list() { + // Try to add waiter_flag to the ref counter. + // Important: this function must never add waiter_flag once the work is done; otherwise the flag would never be removed + + auto expected = m_ref_count.load(std::memory_order_relaxed); + __TBB_ASSERT(is_locked() || m_version_and_traits == 0, NULL); + + while (!(expected & waiter_flag) && continue_execution()) { + if (m_ref_count.compare_exchange_strong(expected, expected | waiter_flag)) { + __TBB_ASSERT(!(expected & waiter_flag), NULL); + expected |= waiter_flag; + break; + } + } + + // Returns whether waiter_flag is set in ref_count + return expected & waiter_flag; +} + +void wait_context::unregister_waiter(r1::wait_node& node) { + lock_guard lock(*this); + + if (m_wait_head != nullptr) { + if (m_wait_head == &node) { + m_wait_head = node.my_next; + } + node.unlink(); + } +} + +void wait_context::notify_waiters() { + lock_guard lock(*this); + + if (m_wait_head != nullptr) { + m_wait_head->notify_all(*this); + m_wait_head = nullptr; + } + + m_ref_count.store(m_ref_count.load(std::memory_order_relaxed) & ~waiter_flag, std::memory_order_relaxed); +} + +} // namespace d1 + namespace r1 { //------------------------------------------------------------------------ @@ -153,10 +220,24 @@ void thread_data::do_post_resume_action() { __TBB_ASSERT(my_post_resume_arg, "The post resume action must have an argument"); switch (my_post_resume_action) { - case post_resume_action::abandon: + case post_resume_action::register_waiter: { - d1::wait_context& wo =
*static_cast<d1::wait_context*>(my_post_resume_arg); - wo.abandon_wait(); + auto& data = *static_cast<register_waiter_data*>(my_post_resume_arg); + + // Backward compatibility support + if (data.wo.m_version_and_traits == 0) { + data.wo.m_wait_head = reinterpret_cast<r1::wait_node*>(data.node.my_suspend_point); + if (!data.wo.publish_wait_list()) { + r1::resume(data.node.my_suspend_point); + } + break; + } + + auto wait_condition = [&data] { return data.wo.continue_execution(); }; + if (!data.wo.try_register_waiter(data.node, wait_condition)) { + r1::resume(data.node.my_suspend_point); + } + break; } case post_resume_action::callback: @@ -206,6 +287,12 @@ suspend_point_type* current_suspend_point() { #endif /* __TBB_RESUMABLE_TASKS */ +void notify_waiters(d1::wait_context& wc) { + __TBB_ASSERT(wc.m_version_and_traits > 0, NULL); + + wc.notify_waiters(); +} + } // namespace r1 } // namespace detail } // namespace tbb diff --git a/src/tbb/task_dispatcher.cpp b/src/tbb/task_dispatcher.cpp index 773cca6fd7..45d7883c73 100644 --- a/src/tbb/task_dispatcher.cpp +++ b/src/tbb/task_dispatcher.cpp @@ -15,6 +15,7 @@ */ #include "task_dispatcher.h" +#include "waiters.h" namespace tbb { namespace detail { @@ -150,37 +151,6 @@ d1::task_group_context* __TBB_EXPORTED_FUNC current_context() { } } -class external_waiter { - d1::wait_context& m_wait_ctx; - stealing_loop_backoff my_backoff; -public: - external_waiter(d1::wait_context& wo, int num_workers) : m_wait_ctx( wo ), my_backoff( num_workers ) {} - - bool continue_execution(arena_slot& slot, d1::task*& t) const { - __TBB_ASSERT(t == nullptr, nullptr); - if (!m_wait_ctx.continue_execution()) - return false; - t = get_self_recall_task(slot); - return true; - } - - void pause() { - my_backoff.pause(); - } - - void reset_wait() { - my_backoff.reset_wait(); - } - - d1::wait_context* wait_ctx() { - return &m_wait_ctx; - } - - static bool postpone_execution(d1::task&) { - return false; - } -}; - void task_dispatcher::execute_and_wait(d1::task* t, d1::wait_context& wait_ctx, d1::task_group_context& w_ctx) { // Get an associated task dispatcher thread_data* tls = governor::get_thread_data(); @@ -195,10 +165,9 @@ void task_dispatcher::execute_and_wait(d1::task* t, d1::wait_context& wait_ctx, }
// Waiting on special object tied to a waiting thread. - external_waiter waiter{ wait_ctx, int(tls->my_arena->my_num_slots) }; + external_waiter waiter{ *tls->my_arena, wait_ctx }; t = local_td.local_wait_for_all(t, waiter); __TBB_ASSERT_EX(t == nullptr, "External waiter must not leave dispatch loop with a task"); - __TBB_ASSERT(wait_ctx.continue_execution() == false, "Thread can only leave dispatch loop when waiting object allowed this"); // Master (external) thread couldn't exit the dispatch loop in an idle state if (local_td.m_thread_data->my_inbox.is_idle_state(true)) { @@ -213,34 +182,6 @@ void task_dispatcher::execute_and_wait(d1::task* t, d1::wait_context& wait_ctx, #if __TBB_RESUMABLE_TASKS -class coroutine_waiter { - stealing_loop_backoff my_backoff; -public: - coroutine_waiter(int num_workers) : my_backoff(num_workers) {} - - bool continue_execution(arena_slot& slot, d1::task*& t) const { - __TBB_ASSERT(t == nullptr, nullptr); - t = get_self_recall_task(slot); - return true; - } - - void pause() { - my_backoff.pause(); - } - - void reset_wait() { - my_backoff.reset_wait(); - } - - d1::wait_context* wait_ctx() { - return nullptr; - } - - static bool postpone_execution(d1::task& t) { - return task_accessor::is_resume_task(t); - } -}; - #if _WIN32 /* [[noreturn]] */ void __stdcall co_local_wait_for_all(void* arg) noexcept #else @@ -267,7 +208,8 @@ class coroutine_waiter { // Endless loop here because coroutine could be reused for (;;) { - coroutine_waiter waiter(m_thread_data->my_arena->my_num_slots); + arena* a = m_thread_data->my_arena; + coroutine_waiter waiter(*a); d1::task* resume_task = local_wait_for_all(nullptr, waiter); assert_task_valid(resume_task); __TBB_ASSERT(this == m_thread_data->my_task_dispatcher, nullptr); diff --git a/src/tbb/task_dispatcher.h b/src/tbb/task_dispatcher.h index f9ac45a28c..89d99fbc21 100644 --- a/src/tbb/task_dispatcher.h +++ b/src/tbb/task_dispatcher.h @@ -22,6 +22,7 @@ #include "tbb/global_control.h" #include "scheduler_common.h" +#include "waiters.h" #include "arena_slot.h" #include "arena.h" #include "thread_data.h" @@ -61,11 +62,13 @@ inline d1::task* get_self_recall_task(arena_slot& slot) { inline d1::task* suspend_point_type::resume_task::execute(d1::execution_data& ed) { execution_data_ext& ed_ext = static_cast<execution_data_ext&>(ed); + resume_node node{ed_ext.task_disp->get_suspend_point()}; + thread_data::register_waiter_data wait_data{*ed_ext.wait_ctx, node}; + if (ed_ext.wait_ctx) { // The wait_ctx is present only in external_waiter. In that case we leave the current stack // in the abandoned state to resume when waiting completes. - ed_ext.wait_ctx->m_waiting_coroutine = ed_ext.task_disp->get_suspend_point(); - ed_ext.task_disp->m_thread_data->set_post_resume_action(thread_data::post_resume_action::abandon, ed_ext.wait_ctx); + ed_ext.task_disp->m_thread_data->set_post_resume_action(thread_data::post_resume_action::register_waiter, &wait_data); } else { // If wait_ctx is null, it can only be a worker thread on the outermost level, because // coroutine_waiter interrupts the bypass loop before the resume_task executes. @@ -215,7 +218,7 @@ d1::task* task_dispatcher::receive_or_steal_task( break; // Stealing success, end of stealing attempt } // Nothing to do, pause a little.
- waiter.pause(); + waiter.pause(slot); } // end of nonlocal task retrieval loop if (inbox.is_idle_state(true)) { inbox.set_is_idle(false); @@ -362,7 +365,11 @@ inline void task_dispatcher::recall_point() { __TBB_ASSERT(m_suspend_point->m_is_owner_recalled.load(std::memory_order_relaxed) == false, nullptr); d1::suspend([](suspend_point_type* sp) { sp->m_is_owner_recalled.store(true, std::memory_order_release); + sp->m_arena->my_sleep_monitors.notify([sp] (std::uintptr_t tag) { + return tag == std::uintptr_t(sp); + }); }); + if (m_thread_data->my_inbox.is_idle_state(true)) { m_thread_data->my_inbox.set_is_idle(false); } @@ -370,15 +377,6 @@ inline void task_dispatcher::recall_point() { } #endif /* __TBB_RESUMABLE_TASKS */ -template -d1::task* local_wait_for_all(d1::task* t, Waiter& waiter) { - if (governor::is_itt_present()) { - return local_wait_for_all(t, waiter); - } else { - return local_wait_for_all(t, waiter); - } -} - #if __TBB_PREVIEW_CRITICAL_TASKS inline d1::task* task_dispatcher::get_critical_task(d1::task* t, execution_data_ext& ed, isolation_type isolation, bool critical_allowed) { __TBB_ASSERT( critical_allowed || !m_properties.critical_task_allowed, nullptr ); diff --git a/src/tbb/thread_data.h b/src/tbb/thread_data.h index 86d61eeb38..ceda3e1fc9 100644 --- a/src/tbb/thread_data.h +++ b/src/tbb/thread_data.h @@ -38,6 +38,7 @@ class task; class arena_slot; class task_group_context; class task_dispatcher; +struct resume_node; //------------------------------------------------------------------------ // Thread Data @@ -150,7 +151,7 @@ class thread_data : public ::rml::job //! The list of possible post resume actions. enum class post_resume_action { invalid, - abandon, + register_waiter, callback, cleanup, notify, @@ -169,6 +170,11 @@ class thread_data : public ::rml::job } }; + struct register_waiter_data { + d1::wait_context& wo; + resume_node& node; + }; + //! Suspends the current coroutine (task_dispatcher). void suspend(void* suspend_callback, void* user_callback); diff --git a/src/tbb/waiters.h b/src/tbb/waiters.h new file mode 100644 index 0000000000..d1882fa260 --- /dev/null +++ b/src/tbb/waiters.h @@ -0,0 +1,302 @@ +/* + Copyright (c) 2005-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#ifndef _TBB_waiters_H +#define _TBB_waiters_H + +#include "tbb/detail/_task.h" +#include "scheduler_common.h" +#include "arena.h" + +namespace tbb { +namespace detail { +namespace r1 { + +// Organizes the wait list stored in wait_context +struct wait_node { + virtual void notify(d1::wait_context& wo) = 0; + + void notify_all(d1::wait_context& wo) { + wait_node* curr_node = this; + wait_node* next_node = nullptr; + + for (; curr_node != nullptr; curr_node = next_node) { + next_node = curr_node->my_next; + curr_node->notify(wo); + } + } + + void link(wait_node* next_node) { + my_next = next_node; + my_prev = nullptr; + + if (next_node != nullptr) { + next_node->my_prev = this; + } + } + + void unlink() { + if (my_prev) { + my_prev->my_next = my_next; + } + + if (my_next) { + my_next->my_prev = my_prev; + } + } + + wait_node* my_next{nullptr}; + wait_node* my_prev{nullptr}; +}; + +struct sleep_node : public wait_node { + sleep_node(arena& a) : my_arena(a) + {} + + void notify(d1::wait_context& wo) override { + std::uintptr_t wait_tag = reinterpret_cast<std::uintptr_t>(&wo); + my_arena.my_sleep_monitors.notify( + [&wait_tag] (std::uintptr_t tag) { + return tag == wait_tag; + } + ); + } + + arena& my_arena; +}; + +struct resume_node : public wait_node { + resume_node(suspend_point_type* sp) : my_suspend_point(sp) + {} + + void notify(d1::wait_context&) override { + r1::resume(my_suspend_point); + } + + suspend_point_type* my_suspend_point; +}; + +inline d1::task* get_self_recall_task(arena_slot& slot); + +class waiter_base { +public: + waiter_base(arena& a) : my_arena(a), my_backoff(int(a.my_num_slots)) {} + + bool pause() { + if (my_backoff.pause()) { + my_arena.is_out_of_work(); + return true; + } + + return false; + } + + void reset_wait() { + my_backoff.reset_wait(); + } + +protected: + arena& my_arena; + stealing_loop_backoff my_backoff; +}; + +class outermost_worker_waiter : public waiter_base { +public: + using waiter_base::waiter_base; + + bool continue_execution(arena_slot& slot, d1::task*& t) const { + __TBB_ASSERT(t == nullptr, nullptr); + + if (is_worker_should_leave(slot)) { + // Leave dispatch loop + return false; + } + + t = get_self_recall_task(slot); + return true; + } + + void pause(arena_slot&) { + waiter_base::pause(); + } + + + d1::wait_context* wait_ctx() { + return nullptr; + } + + static bool postpone_execution(d1::task&) { + return false; + } + +private: + using base_type = waiter_base; + + bool is_worker_should_leave(arena_slot& slot) const { + bool is_top_priority_arena = my_arena.my_is_top_priority.load(std::memory_order_relaxed); + bool is_task_pool_empty = slot.task_pool.load(std::memory_order_relaxed) == EmptyTaskPool; + + if (is_top_priority_arena) { + // Workers in the top-priority arena do not leave it until all work in the task pool is done + if (is_task_pool_empty && my_arena.is_recall_requested()) { + return true; + } + } else { + if (my_arena.is_recall_requested()) { + // If the worker still has work in its task pool, we must notify other threads; + // otherwise their wakeups may be missed + if (!is_task_pool_empty) { + my_arena.advertise_new_work(); + } + return true; + } + } + + return false; + } +}; + +class sleep_waiter : public waiter_base { +protected: + using waiter_base::waiter_base; + + bool is_arena_empty() { + return my_arena.my_pool_state.load(std::memory_order_relaxed) == arena::SNAPSHOT_EMPTY; + } +}; + +class external_waiter : public sleep_waiter { +public: + external_waiter(arena& a, d1::wait_context& wo) + : sleep_waiter(a), my_wait_ctx(wo) + {} + + bool
continue_execution(arena_slot& slot, d1::task*& t) const { + __TBB_ASSERT(t == nullptr, nullptr); + if (!my_wait_ctx.continue_execution()) + return false; + t = get_self_recall_task(slot); + return true; + } + + void pause(arena_slot&) { + if (!sleep_waiter::pause()) { + return; + } + + // Backward compatibility support + if (my_wait_ctx.m_version_and_traits == 0) { + return; + } + + concurrent_monitor::thread_context thr_ctx; + auto sleep_condition = [&] { return is_arena_empty() && my_wait_ctx.continue_execution(); }; + + if (sleep_condition()) { + my_arena.my_sleep_monitors.prepare_wait(thr_ctx, reinterpret_cast<std::uintptr_t>(&my_wait_ctx)); + sleep_node node{my_arena}; + + if (my_wait_ctx.try_register_waiter(node, sleep_condition)) { + my_arena.my_sleep_monitors.commit_wait(thr_ctx); + my_wait_ctx.unregister_waiter(node); + } else { + my_arena.my_sleep_monitors.cancel_wait(thr_ctx); + } + } + } + + d1::wait_context* wait_ctx() { + return &my_wait_ctx; + } + + static bool postpone_execution(d1::task&) { + return false; + } + +private: + d1::wait_context& my_wait_ctx; +}; + +#if __TBB_RESUMABLE_TASKS + +class coroutine_waiter : public sleep_waiter { +public: + using sleep_waiter::sleep_waiter; + + bool continue_execution(arena_slot& slot, d1::task*& t) const { + __TBB_ASSERT(t == nullptr, nullptr); + t = get_self_recall_task(slot); + return true; + } + + void pause(arena_slot& slot) { + if (!sleep_waiter::pause()) { + return; + } + + concurrent_monitor::thread_context thr_ctx; + suspend_point_type* sp = slot.default_task_dispatcher().m_suspend_point; + + auto sleep_condition = [&] { return is_arena_empty() && !sp->m_is_owner_recalled.load(std::memory_order_relaxed); }; + + if (sleep_condition()) { + my_arena.my_sleep_monitors.prepare_wait(thr_ctx, (std::uintptr_t)sp); + if (sleep_condition()) { + my_arena.my_sleep_monitors.commit_wait(thr_ctx); + } else { + my_arena.my_sleep_monitors.cancel_wait(thr_ctx); + } + } + } + + void reset_wait() { + my_backoff.reset_wait(); + } + + d1::wait_context* wait_ctx() { + return nullptr; + } + + static bool postpone_execution(d1::task& t) { + return task_accessor::is_resume_task(t); + } +}; + +#endif // __TBB_RESUMABLE_TASKS + +} // namespace r1 + +namespace d1 { +template <typename F> +bool wait_context::try_register_waiter(r1::wait_node& waiter, F&& condition) { + bool result = condition(); + if (result) { + lock_guard lock(*this); + + result = result && publish_wait_list(); + if (result) { + waiter.link(m_wait_head); + m_wait_head = &waiter; + } + } + return result; +} +} // namespace d1 + +} // namespace detail +} // namespace tbb + +#endif // _TBB_waiters_H diff --git a/src/tbbbind/CMakeLists.txt b/src/tbbbind/CMakeLists.txt index 9f5485be72..cb0a95fd03 100644 --- a/src/tbbbind/CMakeLists.txt +++ b/src/tbbbind/CMakeLists.txt @@ -37,10 +37,12 @@ target_compile_options(tbbbind # Avoid use of target_link_libraries here as it changes /DEF option to \DEF on Windows.
set_target_properties(tbbbind PROPERTIES - LINK_FLAGS ${TBB_LINK_DEF_FILE_FLAG}${CMAKE_CURRENT_SOURCE_DIR}/def/${TBB_DEF_FILE_PREFIX}-tbbbind.def DEFINE_SYMBOL "" VERSION ${TBBBIND_BINARY_VERSION}.${TBB_BINARY_MINOR_VERSION} - SOVERSION ${TBBBIND_BINARY_VERSION}) + SOVERSION ${TBBBIND_BINARY_VERSION} + LINK_FLAGS ${TBB_LINK_DEF_FILE_FLAG}${CMAKE_CURRENT_SOURCE_DIR}/def/${TBB_DEF_FILE_PREFIX}-tbbbind.def + LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/def/${TBB_DEF_FILE_PREFIX}-tbbbind.def +) # Prefer using target_link_options instead of target_link_libraries to specify link options because # target_link_libraries may incorrectly handle some options (on Windows, for example). diff --git a/src/tbbmalloc/CMakeLists.txt b/src/tbbmalloc/CMakeLists.txt index b3270a130e..4887a04f33 100644 --- a/src/tbbmalloc/CMakeLists.txt +++ b/src/tbbmalloc/CMakeLists.txt @@ -53,10 +53,12 @@ target_compile_options(tbbmalloc # Avoid use of target_link_libraries here as it changes /DEF option to \DEF on Windows. set_target_properties(tbbmalloc PROPERTIES - LINK_FLAGS ${TBB_LINK_DEF_FILE_FLAG}${CMAKE_CURRENT_SOURCE_DIR}/def/${TBB_DEF_FILE_PREFIX}-tbbmalloc.def DEFINE_SYMBOL "" VERSION ${TBBMALLOC_BINARY_VERSION}.${TBB_BINARY_MINOR_VERSION} - SOVERSION ${TBBMALLOC_BINARY_VERSION}) + SOVERSION ${TBBMALLOC_BINARY_VERSION} + LINK_FLAGS ${TBB_LINK_DEF_FILE_FLAG}${CMAKE_CURRENT_SOURCE_DIR}/def/${TBB_DEF_FILE_PREFIX}-tbbmalloc.def + LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/def/${TBB_DEF_FILE_PREFIX}-tbbmalloc.def +) # Prefer using target_link_options instead of target_link_libraries to specify link options because # target_link_libraries may incorrectly handle some options (on Windows, for example). diff --git a/src/tbbmalloc/shared_utils.h b/src/tbbmalloc/shared_utils.h index 6904663e85..4dc82a04d5 100644 --- a/src/tbbmalloc/shared_utils.h +++ b/src/tbbmalloc/shared_utils.h @@ -51,11 +51,6 @@ static inline T alignUpGeneric(T arg, uintptr_t alignment) { return arg; } -template<typename T, size_t N> // generic function to find length of array -inline size_t arrayLength(const T(&)[N]) { - return N; -} - /* * Compile time Log2 calculation */ diff --git a/src/tbbmalloc_proxy/CMakeLists.txt b/src/tbbmalloc_proxy/CMakeLists.txt index 6ebc39ac2c..21dcda9516 100644 --- a/src/tbbmalloc_proxy/CMakeLists.txt +++ b/src/tbbmalloc_proxy/CMakeLists.txt @@ -44,6 +44,7 @@ if (UNIX AND NOT APPLE) # Avoid use of target_link_libraries here as it changes /DEF option to \DEF on Windows.
set_target_properties(tbbmalloc_proxy PROPERTIES LINK_FLAGS ${TBB_LINK_DEF_FILE_FLAG}${CMAKE_CURRENT_SOURCE_DIR}/def/${TBB_DEF_FILE_PREFIX}-proxy.def + LINK_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/def/${TBB_DEF_FILE_PREFIX}-proxy.def DEFINE_SYMBOL "") endif() diff --git a/src/tbbmalloc_proxy/proxy.cpp b/src/tbbmalloc_proxy/proxy.cpp index 514ff71398..21f3b8f71d 100644 --- a/src/tbbmalloc_proxy/proxy.cpp +++ b/src/tbbmalloc_proxy/proxy.cpp @@ -347,7 +347,10 @@ void operator delete[](void* ptr, const std::nothrow_t&) noexcept { #include "function_replacement.h" -#include "../tbbmalloc/shared_utils.h" +template<typename T, size_t N> // generic function to find length of array +inline size_t arrayLength(const T(&)[N]) { + return N; +} void __TBB_malloc_safer_delete( void *ptr) { diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 8fcfb2b444..82a23de7b3 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -248,6 +248,13 @@ endif() tbb_add_test(SUBDIR tbb NAME test_tbb_header DEPENDENCIES TBB::tbb) target_sources(test_tbb_header PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/tbb/test_tbb_header_secondary.cpp) +if (NOT "${TBB_OPENMP_FLAG}" STREQUAL "" AND NOT "${CMAKE_SYSTEM_PROCESSOR}" MATCHES "mips") + tbb_add_test(SUBDIR tbb NAME test_openmp DEPENDENCIES TBB::tbb) + set_target_properties(test_openmp PROPERTIES + COMPILE_FLAGS ${TBB_OPENMP_FLAG} + LINK_FLAGS ${TBB_OPENMP_FLAG} + ) +endif() # Define the conformance tests tbb_add_test(SUBDIR conformance NAME conformance_tick_count DEPENDENCIES TBB::tbb) diff --git a/test/common/common_arena_constraints.h b/test/common/common_arena_constraints.h index 502dcd2126..561a9cd505 100644 --- a/test/common/common_arena_constraints.h +++ b/test/common/common_arena_constraints.h @@ -14,7 +14,6 @@ limitations under the License. */ -#define TBB_PREVIEW_NUMA_SUPPORT 1 #include "tbb/detail/_config.h" #include "common/test.h" diff --git a/test/common/concurrent_associative_common.h b/test/common/concurrent_associative_common.h index 1dc1446eff..560f8af4e4 100644 --- a/test/common/concurrent_associative_common.h +++ b/test/common/concurrent_associative_common.h @@ -606,6 +606,8 @@ void test_basic_common() REQUIRE(!ccont.range().empty()); REQUIRE((256 == CheckRecursiveRange(cont.range()).first)); REQUIRE((256 == CheckRecursiveRange(ccont.range()).first)); + REQUIRE(cont.range().grainsize() > 0); + REQUIRE(ccont.range().grainsize() > 0); // void swap(T&); cont.swap(newcont); @@ -1486,4 +1488,58 @@ void test_insert_by_generic_pair() { CountingKey::reset(); } +template <typename Container> +void test_swap_not_always_equal_allocator() { + static_assert(std::is_same>::value, + "Incorrect allocator in not always equal test"); + Container c1{}; + Container c2{Value::make(1), Value::make(2)}; + + Container c1_copy = c1; + Container c2_copy = c2; + + c1.swap(c2); + + REQUIRE_MESSAGE(c1 == c2_copy, "Incorrect swap with not always equal allocator"); + REQUIRE_MESSAGE(c2 == c1_copy, "Incorrect swap with not always equal allocator"); +} + +#if TBB_USE_EXCEPTIONS +template <typename Container> +void test_exception_on_copy_ctor() { + Container c1; + c1.emplace(Value::make(ThrowOnCopy{})); + + using container_allocator_type = std::allocator<Container>; + using alloc_traits = std::allocator_traits<container_allocator_type>; + container_allocator_type container_allocator; + Container* c2_ptr = alloc_traits::allocate(container_allocator, 1); + + ThrowOnCopy::activate(); + // Test copy ctor + try { + alloc_traits::construct(container_allocator, c2_ptr, c1); + } catch ( int error_code ) { + REQUIRE_MESSAGE(error_code == ThrowOnCopy::error_code(), "Incorrect code was thrown"); + }
+ REQUIRE_MESSAGE(c2_ptr->empty(), "Incorrect container state after throwing copy constructor"); + + alloc_traits::deallocate(container_allocator, c2_ptr, 1); + c2_ptr = alloc_traits::allocate(container_allocator, 1); + + // Test copy ctor with allocator + try { + auto value_allocator = c1.get_allocator(); + alloc_traits::construct(container_allocator, c2_ptr, c1, value_allocator); + } catch( int error_code ) { + REQUIRE_MESSAGE(error_code == ThrowOnCopy::error_code(), "Incorrect code was thrown"); + } + + REQUIRE_MESSAGE(c2_ptr->empty(), "Incorrect container state after throwing copy ctor with allocator"); + alloc_traits::deallocate(container_allocator, c2_ptr, 1); + ThrowOnCopy::deactivate(); +} +#endif // TBB_USE_EXCEPTIONS + #endif // __TBB_test_common_concurrent_associative_common_H diff --git a/test/common/concurrent_ordered_common.h b/test/common/concurrent_ordered_common.h index b1eab37b53..36a122e2a0 100644 --- a/test/common/concurrent_ordered_common.h +++ b/test/common/concurrent_ordered_common.h @@ -65,6 +65,7 @@ void check_container_order( const Container& cont ) { template void test_ordered_methods() { Container cont; + const Container& ccont = cont; int r, random_threshold = 10, uncontained_key = random_threshold / 2; for (int i = 0; i < 100; ++i) { @@ -97,12 +98,24 @@ void test_ordered_methods() { } } + typename Container::range_type cont_range = cont.range(); + typename Container::const_range_type ccont_range = ccont.range(); + REQUIRE_MESSAGE(cont_range.size() == ccont_range.size(), "Incorrect ordered container range size"); + REQUIRE_MESSAGE(cont_range.size() == cont.size(), "Incorrect ordered container range size"); + typename Container::iterator l_bound = cont.lower_bound(key); typename Container::iterator u_bound = cont.upper_bound(key); REQUIRE_MESSAGE(l_bound == l_bound_check, "lower_bound() returned wrong iterator"); REQUIRE_MESSAGE(u_bound == u_bound_check, "upper_bound() returned wrong iterator"); + using const_iterator = typename Container::const_iterator; + const_iterator cl_bound = ccont.lower_bound(key); + const_iterator cu_bound = ccont.upper_bound(key); + + REQUIRE_MESSAGE(cl_bound == const_iterator(l_bound), "lower_bound() const returned wrong iterator"); + REQUIRE_MESSAGE(cu_bound == const_iterator(u_bound), "upper_bound() const returned wrong iterator"); + REQUIRE((l_bound == eq_range.first && u_bound == eq_range.second)); } } @@ -284,6 +297,8 @@ void check_heterogeneous_bound_functions() { "incorrect key_type for heterogeneous bounds test"); // Initialization Container c; + const Container& cc = c; + int size = 10; for (int i = 0; i < size; ++i) { c.insert(Value::make(i)); @@ -300,6 +315,8 @@ void check_heterogeneous_bound_functions() { REQUIRE_MESSAGE(c.lower_bound(k) == c.lower_bound(key), "Incorrect heterogeneous lower_bound return value"); REQUIRE_MESSAGE(c.upper_bound(k) == c.upper_bound(key), "Incorrect heterogeneous upper_bound return value"); + REQUIRE_MESSAGE(cc.lower_bound(k) == cc.lower_bound(key), "Incorrect const heterogeneous lower_bound return value"); + REQUIRE_MESSAGE(cc.upper_bound(k) == cc.upper_bound(key), "Incorrect const heterogeneous upper_bound return value"); } } diff --git a/test/common/concurrent_priority_queue_common.h b/test/common/concurrent_priority_queue_common.h index a65206bae9..e7bcac2edd 100644 --- a/test/common/concurrent_priority_queue_common.h +++ b/test/common/concurrent_priority_queue_common.h @@ -125,12 +125,16 @@ void type_tester( const std::vector& vec, Compare comp ) { queue_type q4(q1); examine(q4, 
vec_sorted); + // Copy ctor with allocator + auto alloc = q1.get_allocator(); + queue_type q4_alloc(q1, alloc); + examine(q4_alloc, vec_sorted); + // Constructor from the half-open interval queue_type q5(vec.begin(), vec.end()); examine(q5, vec_sorted); // Constructor from the allocator object - auto alloc = q1.get_allocator(); queue_type q6(alloc); q6.assign(vec.begin(), vec.end()); examine(q6, vec_sorted); diff --git a/test/common/config.h b/test/common/config.h index cec90e9db3..aa0dcf8852 100644 --- a/test/common/config.h +++ b/test/common/config.h @@ -45,13 +45,6 @@ #define MAX_TUPLE_TEST_SIZE 5 #endif #else - #if _MSC_VER -// test sizes <= 8 don't get "decorated name length exceeded" errors. (disable : 4503) - #if MAX_TUPLE_TEST_SIZE > 8 - #undef MAX_TUPLE_TEST_SIZE - #define MAX_TUPLE_TEST_SIZE 8 - #endif - #endif #if MAX_TUPLE_TEST_SIZE > __TBB_VARIADIC_MAX #undef MAX_TUPLE_TEST_SIZE #define MAX_TUPLE_TEST_SIZE __TBB_VARIADIC_MAX diff --git a/test/common/containers_common.h b/test/common/containers_common.h index 4c97a80aae..3374c7bb45 100644 --- a/test/common/containers_common.h +++ b/test/common/containers_common.h @@ -128,6 +128,27 @@ void test_allocator_traits_support() { test_is_always_equal(); } +#if TBB_USE_EXCEPTIONS +struct ThrowOnCopy { + static int error_code() { return 8; }; + static bool is_active; + ThrowOnCopy() = default; + ThrowOnCopy( const ThrowOnCopy& ) { + if (is_active) { + throw error_code(); + } + } + static void activate() { is_active = true; } + static void deactivate() { is_active = false; } + + bool operator<( const ThrowOnCopy& ) const { return true; } + bool operator==( const ThrowOnCopy& ) const { return true; } +}; // struct ThrowOnCopy + +bool ThrowOnCopy::is_active = false; + +#endif + namespace std { template <typename T> struct hash<std::weak_ptr<T>> { @@ -144,6 +165,15 @@ struct hash<std::weak_ptr<T>> { } }; +#if TBB_USE_EXCEPTIONS +template <> +struct hash<ThrowOnCopy> { + std::size_t operator()( const ThrowOnCopy& ) const { + return 1; + } +}; +#endif + template <typename T> struct equal_to<std::weak_ptr<T>> { std::size_t operator()( const std::weak_ptr<T>& rhs, const std::weak_ptr<T>& lhs ) const { diff --git a/test/common/graph_utils.h b/test/common/graph_utils.h index f337e4f593..c50e177b80 100644 --- a/test/common/graph_utils.h +++ b/test/common/graph_utils.h @@ -314,9 +314,7 @@ struct harness_counting_receiver : public tbb::flow::receiver<T> { size_t n = my_count; CHECK( n == num_copies*max_value ); } - - void reset_receiver(tbb::flow::reset_flags /*f*/) override { my_count = 0; } -}; + }; //! Counts the number of puts received template< typename T > struct harness_mapped_receiver : public tbb::flow::receiver<T> { @@ -335,6 +333,11 @@ struct harness_mapped_receiver : public tbb::flow::receiver<T> { my_count = 0; } +#if __INTEL_COMPILER <= 2021 + // Suppress a superfluous diagnostic about the missing virtual keyword on the destructor of + // a derived class whose parent class declares its destructor virtual.
+ virtual +#endif ~harness_mapped_receiver() { if ( my_multiset ) delete my_multiset; } @@ -374,7 +377,7 @@ struct harness_mapped_receiver : public tbb::flow::receiver<T> { } } - void reset_receiver(tbb::flow::reset_flags /*f*/) override { + void reset_receiver(tbb::flow::reset_flags /*f*/) { my_count = 0; if(my_multiset) delete my_multiset; my_multiset = new multiset_type; diff --git a/test/common/memory_usage.h b/test/common/memory_usage.h index 594491c2ce..88b5f2d7b6 100644 --- a/test/common/memory_usage.h +++ b/test/common/memory_usage.h @@ -30,7 +30,7 @@ #include <sys/utsname.h> /* for use in LinuxKernelVersion() */ // Parse file utility for THP info -#include "../../src/tbbmalloc/shared_utils.h" +#include "src/tbbmalloc/shared_utils.h" #elif __APPLE__ && !__ARM_ARCH #include <mach/mach.h> diff --git a/test/common/node_handling_support.h b/test/common/node_handling_support.h index 0241c207c9..81dedb63bb 100644 --- a/test/common/node_handling_support.h +++ b/test/common/node_handling_support.h @@ -83,6 +83,7 @@ void test_node_handle( Container test_table ) { nh = test_table.unsafe_extract(test_table.begin()); REQUIRE_MESSAGE(!nh.empty(), "Node handle: node_type object is empty after valid move assignment"); + REQUIRE_MESSAGE(nh.get_allocator() == test_table.get_allocator(), "Node handle: node_type object allocator is incorrect"); REQUIRE_MESSAGE(compare_handle_getters(nh, expected_value), "Node handle: node_type object does not contains expected value after valid move assignment"); diff --git a/test/common/utils.h b/test/common/utils.h index 972f754b5e..f9745b1758 100644 --- a/test/common/utils.h +++ b/test/common/utils.h @@ -397,6 +397,17 @@ tbb::blocked_range<T*> make_blocked_range( T(& array)[N] ) { return tbb::blocked_range<T*>(array, array + N); } +template <typename T> +void check_range_bounds_after_splitting( const tbb::blocked_range<T>& original, const tbb::blocked_range<T>& first, + const tbb::blocked_range<T>& second, const T& expected_first_end ) +{ + REQUIRE(first.begin() == original.begin()); + REQUIRE(first.end() == expected_first_end); + REQUIRE(second.begin() == expected_first_end); + REQUIRE(second.end() == original.end()); + REQUIRE(first.size() + second.size() == original.size()); +} + //! Functor with N dummy iterations in its body class DummyBody { int m_numIters; diff --git a/test/conformance/conformance_blocked_range.cpp b/test/conformance/conformance_blocked_range.cpp index fbaf629eb7..a6273a6b6d 100644 --- a/test/conformance/conformance_blocked_range.cpp +++ b/test/conformance/conformance_blocked_range.cpp @@ -108,7 +108,6 @@ void ParallelTest() { } } - //! Testing blocked_range interface //! \brief \ref interface \ref requirement TEST_CASE("Basic serial") { @@ -124,6 +123,26 @@ TEST_CASE("Basic parallel") { } } +//! Testing blocked_range with proportional splitting +//!
\brief \ref interface \ref requirement +TEST_CASE("blocked_range proportional splitting") { + tbb::blocked_range<int> original(0, 100); + tbb::blocked_range<int> first(original); + tbb::proportional_split ps(3, 1); + tbb::blocked_range<int> second(first, ps); + + // Test proportional_split -> split conversion + tbb::blocked_range<int> copy(original); + tbb::split s = tbb::split(ps); + tbb::blocked_range<int> splitted_copy(copy, s); + CHECK(copy.size() == original.size() / 2); + CHECK(splitted_copy.size() == copy.size()); + + + int expected_first_end = original.begin() + ps.left() * (original.end() - original.begin()) / (ps.left() + ps.right()); + utils::check_range_bounds_after_splitting(original, first, second, expected_first_end); +} + #if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT //! Testing blocked_range deduction guides //! \brief \ref interface diff --git a/test/conformance/conformance_blocked_range2d.cpp b/test/conformance/conformance_blocked_range2d.cpp index 2c56e471ba..d041b4ea2f 100644 --- a/test/conformance/conformance_blocked_range2d.cpp +++ b/test/conformance/conformance_blocked_range2d.cpp @@ -150,6 +150,24 @@ TEST_CASE("Parallel test") { } } +//! Testing blocked_range2d with proportional splitting +//! \brief \ref interface \ref requirement +TEST_CASE("blocked_range2d proportional splitting") { + tbb::blocked_range2d<int> original(0, 100, 0, 100); + tbb::blocked_range2d<int> first(original); + tbb::proportional_split ps(3, 1); + tbb::blocked_range2d<int> second(first, ps); + + int expected_first_end = original.rows().begin() + ps.left() * (original.rows().end() - original.rows().begin()) / (ps.left() + ps.right()); + if (first.rows().size() == second.rows().size()) { + // Splitting was done by cols + utils::check_range_bounds_after_splitting(original.cols(), first.cols(), second.cols(), expected_first_end); + } else { + // Splitting was done by rows + utils::check_range_bounds_after_splitting(original.rows(), first.rows(), second.rows(), expected_first_end); + } +} + #if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT //! Testing blocked_range2d deduction guides //! \brief \ref interface diff --git a/test/conformance/conformance_blocked_range3d.cpp b/test/conformance/conformance_blocked_range3d.cpp index 5aa63479d2..d2780b9696 100644 --- a/test/conformance/conformance_blocked_range3d.cpp +++ b/test/conformance/conformance_blocked_range3d.cpp @@ -183,6 +183,29 @@ TEST_CASE("Parallel test") { } } +//! Testing blocked_range3d with proportional splitting +//! \brief \ref interface \ref requirement +TEST_CASE("blocked_range3d proportional splitting") { + tbb::blocked_range3d<int> original(0, 100, 0, 100, 0, 100); + tbb::blocked_range3d<int> first(original); + tbb::proportional_split ps(3, 1); + tbb::blocked_range3d<int> second(first, ps); + + int expected_first_end = original.rows().begin() + ps.left() * (original.rows().end() - original.rows().begin()) / (ps.left() + ps.right()); + if (first.rows().size() == second.rows().size()) { + if (first.cols().size() == second.cols().size()) { + // Splitting was done by pages + utils::check_range_bounds_after_splitting(original.pages(), first.pages(), second.pages(), expected_first_end); + } else { + // Splitting was done by cols + utils::check_range_bounds_after_splitting(original.cols(), first.cols(), second.cols(), expected_first_end); + } + } else { + // Splitting was done by rows + utils::check_range_bounds_after_splitting(original.rows(), first.rows(), second.rows(), expected_first_end); + } +} + #if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT //! Testing blocked_range3d deduction guides //!
\brief \ref interface diff --git a/test/conformance/conformance_blocked_rangeNd.cpp b/test/conformance/conformance_blocked_rangeNd.cpp index 99f065276a..552bcfbd9f 100644 --- a/test/conformance/conformance_blocked_rangeNd.cpp +++ b/test/conformance/conformance_blocked_rangeNd.cpp @@ -254,3 +254,20 @@ TEST_CASE("Parallel test") { } } +//! Testing blocked_rangeNd with proportional splitting +//! \brief \ref interface \ref requirement +TEST_CASE("blocked_rangeNd proportional splitting") { + tbb::blocked_rangeNd<int, 2> original{{0, 100}, {0, 100}}; + tbb::blocked_rangeNd<int, 2> first(original); + tbb::proportional_split ps(3, 1); + tbb::blocked_rangeNd<int, 2> second(first, ps); + + int expected_first_end = original.dim(0).begin() + ps.left() * (original.dim(0).end() - original.dim(0).begin()) / (ps.left() + ps.right()); + if (first.dim(0).size() == second.dim(0).size()) { + // Splitting was done by cols + utils::check_range_bounds_after_splitting(original.dim(1), first.dim(1), second.dim(1), expected_first_end); + } else { + // Splitting was done by rows + utils::check_range_bounds_after_splitting(original.dim(0), first.dim(0), second.dim(0), expected_first_end); + } +} diff --git a/test/conformance/conformance_combinable.cpp b/test/conformance/conformance_combinable.cpp index 7232a1b9eb..4d54fcc9ab 100644 --- a/test/conformance/conformance_combinable.cpp +++ b/test/conformance/conformance_combinable.cpp @@ -203,6 +203,12 @@ void RunParallelScalarTests(const char* /* test_name */) { combine_sum += sums.combine(my_combine); combine_ref_sum += sums.combine(my_combine_ref); + // test combinable::clear() + tbb::combinable<int> sums_to_clear; + tbb::parallel_for( tbb::blocked_range<int>(0, N, 10000), ParallelScalarBody(sums_to_clear) ); + sums_to_clear.clear(); + CHECK_MESSAGE(sums_to_clear.combine(my_combine) == 0, "Failed combinable::clear test"); + // test parallel combinable preinitialized with a functor that returns 0 FunctorAddFinit my_finit_decl; tbb::combinable<int> finit_combinable(my_finit_decl); diff --git a/test/conformance/conformance_concurrent_priority_queue.cpp b/test/conformance/conformance_concurrent_priority_queue.cpp index 193e7d42c1..2f190c5d07 100644 --- a/test/conformance/conformance_concurrent_priority_queue.cpp +++ b/test/conformance/conformance_concurrent_priority_queue.cpp @@ -171,6 +171,7 @@ void test_steal_move_ctor() { container_type dst{std::move(fixture.cpq_src)}; REQUIRE_MESSAGE(previous == MoveOperationTracker::special_member_calls(), "Steal move ctor should not create any new elements"); REQUIRE_MESSAGE(dst == src_copy, "cpq content changed during steal move"); + REQUIRE_MESSAGE(!(dst != src_copy), "cpq content changed during steal move"); } void test_steal_move_ctor_with_allocator() { @@ -185,6 +186,7 @@ void test_steal_move_ctor_with_allocator() { fixture_type::cpq_type dst(std::move(fixture.cpq_src), arena_fixture.source_allocator); REQUIRE_MESSAGE(previous == MoveOperationTracker::special_member_calls(), "Steal move ctor should not create any new elements"); REQUIRE_MESSAGE(dst == src_copy, "cpq content changed during steal move"); + REQUIRE_MESSAGE(!(dst != src_copy), "cpq content changed during steal move"); } void test_per_element_move_ctor_with_allocator() { @@ -202,6 +204,7 @@ void test_per_element_move_ctor_with_allocator() { REQUIRE_MESSAGE(move_ctor_called_cpq_size_times == MoveOperationTracker::special_member_calls(), "Per element move ctor should move initialize all new elements"); REQUIRE_MESSAGE(dst == src_copy, "cpq content changed during move"); + REQUIRE_MESSAGE(!(dst !=
src_copy), "cpq content changed during move"); } void test_steal_move_assign_operator() { @@ -216,6 +219,7 @@ void test_steal_move_assign_operator() { REQUIRE_MESSAGE(previous == MoveOperationTracker::special_member_calls(), "Steal move assign operator should not create any new elements"); REQUIRE_MESSAGE(dst == src_copy, "cpq content changed during steal move assignment"); + REQUIRE_MESSAGE(!(dst != src_copy), "cpq content changed during steal move assignment"); } void test_steal_move_assign_operator_with_stateful_allocator() { @@ -232,6 +236,7 @@ void test_steal_move_assign_operator_with_stateful_allocator() { dst = std::move(fixture.cpq_src); REQUIRE_MESSAGE(previous == MoveOperationTracker::special_member_calls(), "Steal move assign operator should not create any new elements"); REQUIRE_MESSAGE(dst == src_copy, "cpq content changed during steal move assignment"); + REQUIRE_MESSAGE(!(dst != src_copy), "cpq content changed during steal move assignment"); } void test_per_element_move_assign_operator() { @@ -250,6 +255,7 @@ void test_per_element_move_assign_operator() { REQUIRE_MESSAGE(move_ctor_called_cpq_size_times == MoveOperationTracker::special_member_calls(), "Per element move assignment should move initialize all new elements"); REQUIRE_MESSAGE(dst == src_copy, "cpq content changed during per element move assignment"); + REQUIRE_MESSAGE(!(dst != src_copy), "cpq content changed during per element move assignment"); } void test_cpq_move_constructor() { @@ -450,6 +456,7 @@ void test_assignment_clear_swap() { REQUIRE_MESSAGE(!qo.empty(), "Failed assignment empty test"); REQUIRE_MESSAGE(v == toVector(qo), "Failed assignment equality test"); REQUIRE_MESSAGE(qo == q, "Failed assignment equality test"); + REQUIRE_MESSAGE(!(qo != q), "Failed assignment inequality test"); cpq_type assigned_q; // Testing assign member function diff --git a/test/conformance/conformance_enumerable_thread_specific.cpp b/test/conformance/conformance_enumerable_thread_specific.cpp index bbfd18f969..5660591447 100644 --- a/test/conformance/conformance_enumerable_thread_specific.cpp +++ b/test/conformance/conformance_enumerable_thread_specific.cpp @@ -86,6 +86,8 @@ class minimal: utils::NoAssign { ~minimal() { ++destruction_counter; REQUIRE(is_constructed); is_constructed = false; } void set_value( const int i ) { REQUIRE(is_constructed); my_value = i; } int value( ) const { REQUIRE(is_constructed); return my_value; } + + bool operator==( const minimal& other ) { return my_value == other.my_value; } }; static size_t AlignMask = 0; // set to cache-line-size - 1 @@ -135,6 +137,8 @@ class ThrowingConstructor { public: int m_cnt; ThrowingConstructor() : m_checktype(), m_throwing_field() { m_cnt = 0;} + + bool operator==( const ThrowingConstructor& other ) { return m_cnt == other.m_cnt; } private: }; @@ -400,7 +404,8 @@ void run_parallel_scalar_tests_nocombine(const char* /* test_name */, const char } // use const_range_type - typename ets_type::const_range_type cr = sums.range(); + const ets_type& csums = sums; + typename ets_type::const_range_type cr = csums.range(); for ( typename ets_type::const_range_type::iterator i = cr.begin(); i != cr.end(); ++i ) { test_helper::sum(const_range_sum, *i); } @@ -409,6 +414,15 @@ void run_parallel_scalar_tests_nocombine(const char* /* test_name */, const char typedef typename tbb::enumerable_thread_specific, tbb::ets_key_per_instance> cached_ets_type; cached_ets_type cconst(sums); + tbb::parallel_for( tbb::blocked_range(0, N, RANGE_MIN), [&]( const tbb::blocked_range& ) { + bool 
exists = false; + T& ref = cconst.local(exists); + CHECK((exists || ref == T())); + } ); + cached_ets_type cconst_to_assign1 = cconst; + cached_ets_type cconst_to_assign2; + cconst_to_assign2 = std::move(cconst_to_assign1); + REQUIRE(cconst_to_assign2.size() == cconst.size()); for ( typename cached_ets_type::const_iterator i = cconst.begin(); i != cconst.end(); ++i ) { test_helper::sum(cconst_sum, *i); @@ -668,6 +682,11 @@ void run_parallel_vector_tests(const char* /* test_name */, const char *allocato auto it2(it); it = fvs.begin(); REQUIRE(it != it2); + typename tbb::flattened2d::iterator it3; + typename tbb::flattened2d::const_iterator cit = fvs.begin(); + it3 = cit; + REQUIRE(it3 == cit); + REQUIRE(it3.operator->() == &(*it3)); for(typename tbb::flattened2d::const_iterator i = fvs.begin(); i != fvs.end(); ++i) { ++elem_cnt; diff --git a/test/conformance/conformance_graph.cpp b/test/conformance/conformance_graph.cpp index 426411d0b1..5784cc1471 100644 --- a/test/conformance/conformance_graph.cpp +++ b/test/conformance/conformance_graph.cpp @@ -32,16 +32,6 @@ using namespace tbb::flow; using namespace std; -//! const graph -//! \brief \ref error_guessing -TEST_CASE("const graph"){ - const graph g; - CHECK_MESSAGE((g.cbegin() == g.cend()), "Starting graph is empty"); - - graph g2; - CHECK_MESSAGE((g2.begin() == g2.end()), "Starting graph is empty"); -} - //! Graph reset //! \brief \ref requirement TEST_CASE("graph reset") { @@ -144,4 +134,3 @@ TEST_CASE("graph reset") { // TODO: Add check that default invocaiton is the same as with rf_reset_protocol // TODO: See if specification for broadcast_node and other service nodes is sufficient for reset checks } - diff --git a/test/conformance/conformance_mutex.cpp b/test/conformance/conformance_mutex.cpp index 3beb1d137a..a7fc350c23 100644 --- a/test/conformance/conformance_mutex.cpp +++ b/test/conformance/conformance_mutex.cpp @@ -86,6 +86,15 @@ void TestTryAcquire(const char* mutex_name) { } } +template <> +void TestTryAcquire( const char* mutex_name ) { + tbb::null_mutex tested_mutex; + typename tbb::null_mutex::scoped_lock lock(tested_mutex); + CHECK_MESSAGE(lock.try_acquire(tested_mutex), "ERROR for " << mutex_name << ": try_acquire failed though it should not"); + lock.release(); + CHECK_MESSAGE(lock.try_acquire(tested_mutex), "ERROR for " << mutex_name << ": try_acquire failed though it should not"); +} + //! 
Test try_acquire functionality of a non-reenterable mutex template void TestTryAcquireReader(const char* mutex_name) { @@ -112,6 +121,17 @@ void TestTryAcquireReader(const char* mutex_name) { } } +template <> +void TestTryAcquireReader( const char* mutex_name ) { + tbb::null_rw_mutex tested_mutex; + typename tbb::null_rw_mutex::scoped_lock lock(tested_mutex, false); + CHECK_MESSAGE(lock.try_acquire(tested_mutex, false), "Error for " << mutex_name << ": try_acquire on read failed though it should not"); + CHECK_MESSAGE(lock.try_acquire(tested_mutex, true), "Error for " << mutex_name << ": try_acquire on write failed though it should not"); + lock.release(); + CHECK_MESSAGE(lock.try_acquire(tested_mutex, false), "Error for " << mutex_name << ": try_acquire on read failed though it should not"); + CHECK_MESSAGE(lock.try_acquire(tested_mutex, true), "Error for " << mutex_name << ": try_acquire on write failed though it should not"); +} + template struct ArrayCounter { using mutex_type = M; @@ -417,6 +437,7 @@ TEST_CASE("Lockable requirement test") { TestTryAcquire("Queuing RW Mutex"); TestTryAcquire("Speculative Spin Mutex"); TestTryAcquire("Speculative Spin RW Mutex"); + TestTryAcquire("Null Mutex"); } //! Testing ReaderWriterMutex requirements @@ -434,6 +455,7 @@ TEST_CASE("Shared mutexes (reader/writer) test") { TestRWStateMultipleChange("Queuing RW Mutex"); TestTryAcquireReader("Speculative Spin RW Mutex"); TestRWStateMultipleChange("Speculative Spin RW Mutex"); + TestTryAcquireReader("Null RW Mutex"); } //! Testing ISO C++ Mutex and Shared Mutex requirements. diff --git a/test/conformance/conformance_parallel_for.cpp b/test/conformance/conformance_parallel_for.cpp index 7797cffba5..150a47ea0e 100644 --- a/test/conformance/conformance_parallel_for.cpp +++ b/test/conformance/conformance_parallel_for.cpp @@ -232,6 +232,7 @@ void TestParallelForWithStepSupport() { static tbb::affinity_partitioner affinity_p; tbb::auto_partitioner auto_p; tbb::simple_partitioner simple_p; + tbb::static_partitioner static_p; empty_partitioner_tag p; // Try out all partitioner combinations @@ -239,6 +240,7 @@ void TestParallelForWithStepSupport() { TestParallelForWithStepSupportHelper< Flavor,T,const tbb::auto_partitioner >(auto_p); TestParallelForWithStepSupportHelper< Flavor,T,const tbb::simple_partitioner >(simple_p); TestParallelForWithStepSupportHelper< Flavor,T,tbb::affinity_partitioner >(affinity_p); + TestParallelForWithStepSupportHelper< Flavor,T,tbb::static_partitioner >(static_p); // Testing some corner cases tbb::parallel_for(static_cast(2), static_cast(1), static_cast(1), TestFunctor()); @@ -309,4 +311,7 @@ TEST_CASE("Testing parallel_for with partitioners") { parallel_for(Range1(false, true), b, tbb::auto_partitioner()); parallel_for(Range6(false, true), b, tbb::auto_partitioner()); + + parallel_for(Range1(true, false), b, tbb::static_partitioner()); + parallel_for(Range6(false, true), b, tbb::static_partitioner()); } diff --git a/test/conformance/conformance_task_arena.cpp b/test/conformance/conformance_task_arena.cpp index 15b0c42823..466a622cb9 100644 --- a/test/conformance/conformance_task_arena.cpp +++ b/test/conformance/conformance_task_arena.cpp @@ -96,3 +96,12 @@ TEST_CASE("Task arena observer") { REQUIRE(observer.is_callbacks_called()); } +//! Test task arena copy constructor +//! 
\brief \ref interface \ref requirement +TEST_CASE("Task arena copy constructor") { + tbb::task_arena arena(1); + tbb::task_arena copy = arena; + + REQUIRE(arena.max_concurrency() == copy.max_concurrency()); + REQUIRE(arena.is_active() == copy.is_active()); +} diff --git a/test/tbb/test_arena_priorities.cpp b/test/tbb/test_arena_priorities.cpp index 405346dd37..5095e094dd 100644 --- a/test/tbb/test_arena_priorities.cpp +++ b/test/tbb/test_arena_priorities.cpp @@ -14,9 +14,6 @@ limitations under the License. */ - -#define TBB_PREVIEW_NUMA_SUPPORT __TBB_CPF_BUILD - #include "common/test.h" #include "tbb/task_group.h" @@ -88,9 +85,12 @@ tbb::task_arena* do_allocate_and_construct( const ArenaArgs&... arena_args ) break; case explicit_initialize_with_different_constructor_parameters: - result_arena = new tbb::task_arena( dummy_max_concurrency, dummy_reserved_for_masters ); - result_arena->initialize( arena_args... ); + { + tbb::task_arena tmp(dummy_max_concurrency, dummy_reserved_for_masters); + result_arena = new tbb::task_arena(tmp); + result_arena->initialize(arena_args...); break; + } default: REQUIRE_MESSAGE( false, "Not implemented method of initialization." ); @@ -124,7 +124,6 @@ tbb::task_arena* allocate_and_construct_arena( { const int reserved_for_masters = 0; -#if TBB_PREVIEW_NUMA_SUPPORT static bool use_constraints = false; use_constraints = !use_constraints; @@ -132,7 +131,6 @@ tbb::task_arena* allocate_and_construct_arena( tbb::task_arena::constraints properties{tbb::task_arena::automatic, arena_max_concurrency}; return decide_on_arguments( properties, reserved_for_masters, a_priority ); } -#endif return decide_on_arguments( arena_max_concurrency, reserved_for_masters, a_priority ); } diff --git a/test/tbb/test_async_node.cpp b/test/tbb/test_async_node.cpp index 3346fc8f20..bcbb751bda 100644 --- a/test/tbb/test_async_node.cpp +++ b/test/tbb/test_async_node.cpp @@ -725,7 +725,7 @@ void test_follows() { node_t node(follows(preds[0], preds[1], preds[2]), unlimited, [&](int input, node_t::gateway_type& gtw) { async_activity.submit(input, &gtw); - }); + }, no_priority); buffer_node buf(g); make_edge(node, buf); @@ -759,7 +759,7 @@ void test_precedes() { node_t node(precedes(successors[0]), unlimited, [&](int input, node_t::gateway_type& gtw) { async_activity.submit(input, &gtw); - }); + }, no_priority); make_edge(start, node); diff --git a/test/tbb/test_broadcast_node.cpp b/test/tbb/test_broadcast_node.cpp index f7dadeb0e3..626ac151f7 100644 --- a/test/tbb/test_broadcast_node.cpp +++ b/test/tbb/test_broadcast_node.cpp @@ -76,9 +76,6 @@ class counting_array_receiver : public tbb::flow::receiver { tbb::flow::graph& graph_reference() const override { return my_graph; } - - void reset_receiver(tbb::flow::reset_flags /*f*/) override { } - }; template< typename T > diff --git a/test/tbb/test_composite_node.cpp b/test/tbb/test_composite_node.cpp index e7bfedd15e..671fc87418 100644 --- a/test/tbb/test_composite_node.cpp +++ b/test/tbb/test_composite_node.cpp @@ -159,6 +159,9 @@ void add_all_nodes (){ output_only_type c_node(g); c_node.set_external_ports(output_tuple); + // Reset is not supposed to do anything. Check that it can be called.
+ g.reset(); + c_node.add_visible_nodes(src, fxn, m_fxn, bc, lim, ind, s, ct, j, q, bf, pq, wo, ovw, seq); c_node.add_nodes(src, fxn, m_fxn, bc, lim, ind, s, ct, j, q, bf, pq, wo, ovw, seq); @@ -318,10 +321,10 @@ int test_adder(bool hidden = false) { int sum_total=0; int result=0; for ( int i = 1; i < 4; ++i ) { - s.try_put(i); - c.try_put(i); - sum_total += adder_sum(i); - g.wait_for_all(); + s.try_put(i); + c.try_put(i); + sum_total += adder_sum(i); + g.wait_for_all(); } int j; @@ -342,10 +345,10 @@ int test_adder(bool hidden = false) { sum_total=0; result=0; for ( int i = 10; i < 20; ++i ) { - s.try_put(i); - c.try_put(i); - sum_total += adder_sum(i); - g.wait_for_all(); + s.try_put(i); + c.try_put(i); + sum_total += adder_sum(i); + g.wait_for_all(); } for ( int i = 10; i < 20; ++i ) { diff --git a/test/tbb/test_concurrent_hash_map.cpp b/test/tbb/test_concurrent_hash_map.cpp index a28f9cdee3..e263fc4754 100644 --- a/test/tbb/test_concurrent_hash_map.cpp +++ b/test/tbb/test_concurrent_hash_map.cpp @@ -557,7 +557,8 @@ TEST_CASE("Test exception in constructors") { using allocator_type = StaticSharedCountingAllocator>>; using map_type = tbb::concurrent_hash_map, allocator_type>; - auto init_list = {std::pair(1, 42), std::pair(2, 42), std::pair(3, 42)}; + auto init_list = {std::pair(1, 42), std::pair(2, 42), std::pair(3, 42), + std::pair(4, 42), std::pair(5, 42), std::pair(6, 42)}; map_type map(init_list); allocator_type::set_limits(1); @@ -582,6 +583,24 @@ TEST_CASE("Test exception in constructors") { map_type map4(init_list, test_hash); utils::suppress_unused_warning(map4); }(), const std::bad_alloc); + + REQUIRE_THROWS_AS( [&] { + map_type map5(init_list); + utils::suppress_unused_warning(map5); + }(), const std::bad_alloc); + + allocator_type::set_limits(0); + map_type big_map{}; + for (std::size_t i = 0; i < 1000; ++i) { + big_map.insert(std::pair(i, 42)); + } + + allocator_type::init_counters(); + allocator_type::set_limits(300); + REQUIRE_THROWS_AS( [&] { + map_type map6(big_map); + utils::suppress_unused_warning(map6); + }(), const std::bad_alloc); } #endif // TBB_USE_EXCEPTIONS diff --git a/test/tbb/test_concurrent_map.cpp b/test/tbb/test_concurrent_map.cpp index 1f48d9d27a..f2dd9ab688 100644 --- a/test/tbb/test_concurrent_map.cpp +++ b/test/tbb/test_concurrent_map.cpp @@ -217,3 +217,31 @@ TEST_CASE("concurrent_map/multimap with specific key/mapped types") { TEST_CASE("broken internal structure for multimap") { test_cycles_absense(); } + +//! \brief \ref error_guessing +TEST_CASE("concurrent_map::swap with not always equal allocator") { + using not_always_equal_alloc_map_type = tbb::concurrent_map, + NotAlwaysEqualAllocator>>; + test_swap_not_always_equal_allocator(); +} + +//! \brief \ref error_guessing +TEST_CASE("concurrent_multimap::swap with not always equal allocator") { + using not_always_equal_alloc_mmap_type = tbb::concurrent_multimap, + NotAlwaysEqualAllocator>>; + test_swap_not_always_equal_allocator(); +} + +#if TBB_USE_EXCEPTIONS +//! \brief \ref error_guessing +TEST_CASE("concurrent_map throwing copy constructor") { + using exception_map_type = tbb::concurrent_map; + test_exception_on_copy_ctor(); +} + +//! 
\brief \ref error_guessing +TEST_CASE("concurrent_multimap throwing copy constructor") { + using exception_mmap_type = tbb::concurrent_multimap; + test_exception_on_copy_ctor<exception_mmap_type>(); +} +#endif // TBB_USE_EXCEPTIONS diff --git a/test/tbb/test_concurrent_queue.cpp b/test/tbb/test_concurrent_queue.cpp index 718d04a3fb..18f1dc86d5 100644 --- a/test/tbb/test_concurrent_queue.cpp +++ b/test/tbb/test_concurrent_queue.cpp @@ -114,6 +114,43 @@ void TestQueueWorksWithSSE() { } #endif /* HAVE_m256 */ } +#if TBB_USE_EXCEPTIONS + int rnd_elem = -1; + int global_counter = -1; + +struct throw_element { + throw_element() = default; + throw_element(const throw_element&) { + if (global_counter++ == rnd_elem) { + throw std::exception{}; + } + } + + throw_element& operator= (const throw_element&) = default; +}; + +template <typename Queue> +void CopyWithThrowElement() { + utils::FastRandom<> rnd(42); + + Queue source; + + constexpr size_t queue_size = 100000; + for (std::size_t i = 0; i < queue_size; ++i) { + source.emplace(); + } + + for (std::size_t i = 0; i < 100; ++i) { + global_counter = 0; + rnd_elem = rnd.get() % queue_size; + + REQUIRE_THROWS_AS( [&] { + Queue copy(source); + utils::suppress_unused_warning(copy); + }(), std::exception); + } +} +#endif // TBB_USE_EXCEPTIONS //! Test work with different types //! \brief \ref error_guessing @@ -154,4 +191,11 @@ TEST_CASE("Test exception in allocation") { }(), const std::bad_alloc); } } + +//! \brief \ref regression \ref error_guessing +TEST_CASE("Test exception in allocation") { + CopyWithThrowElement<tbb::concurrent_queue<throw_element>>(); + CopyWithThrowElement<tbb::concurrent_bounded_queue<throw_element>>(); +} + #endif // TBB_USE_EXCEPTIONS diff --git a/test/tbb/test_concurrent_set.cpp b/test/tbb/test_concurrent_set.cpp index 8be30dda40..d38defa29e 100644 --- a/test/tbb/test_concurrent_set.cpp +++ b/test/tbb/test_concurrent_set.cpp @@ -204,6 +204,32 @@ TEST_CASE("concurrent_multiset with std::scoped_allocator_adaptor") { } //! \brief \ref regression -TEST_CASE("broken internal structure for multimap") { +TEST_CASE("broken internal structure for multiset") { test_cycles_absense(); } + +//! \brief \ref error_guessing +TEST_CASE("concurrent_set::swap with not always equal allocator") { + using not_always_equal_alloc_set_type = tbb::concurrent_set<int, std::less<int>, NotAlwaysEqualAllocator<int>>; + test_swap_not_always_equal_allocator<not_always_equal_alloc_set_type>(); +} + +//! \brief \ref error_guessing +TEST_CASE("concurrent_multiset::swap with not always equal allocator") { + using not_always_equal_alloc_mset_type = tbb::concurrent_multiset<int, std::less<int>, NotAlwaysEqualAllocator<int>>; + test_swap_not_always_equal_allocator<not_always_equal_alloc_mset_type>(); +} + +#if TBB_USE_EXCEPTIONS +//! \brief \ref error_guessing +TEST_CASE("concurrent_set throwing copy constructor") { + using exception_set_type = tbb::concurrent_set; + test_exception_on_copy_ctor<exception_set_type>(); +} + +//! \brief \ref error_guessing +TEST_CASE("concurrent_multiset throwing copy constructor") { + using exception_mset_type = tbb::concurrent_multiset; + test_exception_on_copy_ctor<exception_mset_type>(); +} +#endif // TBB_USE_EXCEPTIONS diff --git a/test/tbb/test_concurrent_unordered_map.cpp b/test/tbb/test_concurrent_unordered_map.cpp index daae51f186..0d68a61800 100644 --- a/test/tbb/test_concurrent_unordered_map.cpp +++ b/test/tbb/test_concurrent_unordered_map.cpp @@ -193,4 +193,50 @@ TEST_CASE("concurrent_unordered map/multimap with specific key/mapped types") { test_specific_types(); } +//!
\brief \ref error_guessing +TEST_CASE("concurrent_unordered_map::swap with not always equal allocator") { + using not_always_equal_alloc_map_type = tbb::concurrent_unordered_map, std::equal_to, + NotAlwaysEqualAllocator>>; + test_swap_not_always_equal_allocator(); +} + +//! \brief \ref error_guessing +TEST_CASE("concurrent_unordered_multimap::swap with not always equal allocator") { + using not_always_equal_alloc_mmap_type = tbb::concurrent_unordered_multimap, std::equal_to, + NotAlwaysEqualAllocator>>; + test_swap_not_always_equal_allocator(); +} + +#if TBB_USE_EXCEPTIONS +//! \brief \ref error_guessing +TEST_CASE("concurrent_unordered_map throwing copy constructor") { + using exception_map_type = tbb::concurrent_unordered_map; + test_exception_on_copy_ctor(); +} + +//! \brief \ref error_guessing +TEST_CASE("concurrent_unordered_multimap throwing copy constructor") { + using exception_mmap_type = tbb::concurrent_unordered_multimap; + test_exception_on_copy_ctor(); +} + +//! \brief \ref error_guessing +TEST_CASE("concurrent_unordered_map whitebox throwing copy constructor") { + using allocator_type = StaticSharedCountingAllocator>>; + using exception_mmap_type = tbb::concurrent_unordered_map, std::equal_to, allocator_type>; + + exception_mmap_type map; + for (std::size_t i = 0; i < 10; ++i) { + map.insert(std::pair(i, 42)); + } + + allocator_type::set_limits(1); + REQUIRE_THROWS_AS( [&] { + exception_mmap_type map1(map); + utils::suppress_unused_warning(map1); + }(), const std::bad_alloc); +} + +#endif // TBB_USE_EXCEPTIONS + // TODO: add test_scoped_allocator support with broken macro diff --git a/test/tbb/test_concurrent_unordered_set.cpp b/test/tbb/test_concurrent_unordered_set.cpp index 5ca9b0df67..36e128116c 100644 --- a/test/tbb/test_concurrent_unordered_set.cpp +++ b/test/tbb/test_concurrent_unordered_set.cpp @@ -183,3 +183,31 @@ TEST_CASE("concurrent_unordered_set with std::scoped_allocator_adaptor") { TEST_CASE("concurrent_unordered_multiset with std::scoped_allocator_adaptor") { test_scoped_allocator(); } + +//! \brief \ref error_guessing +TEST_CASE("concurrent_unordered_set::swap with not always equal allocator") { + using not_always_equal_alloc_set_type = tbb::concurrent_unordered_set, std::equal_to, + NotAlwaysEqualAllocator>; + test_swap_not_always_equal_allocator(); +} + +//! \brief \ref error_guessing +TEST_CASE("concurrent_unordered_multiset::swap with not always equal allocator") { + using not_always_equal_alloc_mset_type = tbb::concurrent_unordered_multiset, std::equal_to, + NotAlwaysEqualAllocator>; + test_swap_not_always_equal_allocator(); +} + +#if __TBB_USE_EXCEPTIONS +//! \brief \ref error_guessing +TEST_CASE("concurrent_unordered_set throwing copy constructor") { + using exception_set_type = tbb::concurrent_unordered_set; + test_exception_on_copy_ctor(); +} + +//! \brief \ref error_guessing +TEST_CASE("concurrent_unordered_multimap throwing copy constructor") { + using exception_mset_type = tbb::concurrent_unordered_multiset; + test_exception_on_copy_ctor(); +} +#endif // __TBB_USE_EXCEPTIONS diff --git a/test/tbb/test_concurrent_vector.cpp b/test/tbb/test_concurrent_vector.cpp index fb9f4f6a7e..9a0a328ab5 100644 --- a/test/tbb/test_concurrent_vector.cpp +++ b/test/tbb/test_concurrent_vector.cpp @@ -668,7 +668,7 @@ TEST_CASE("Reducing concurrent_vector") { //! 
\brief \ref error_guessing -TEST_CASE("swap with NotAlwaysEqualAllocator allocators"){ +TEST_CASE("swap with not always equal allocators"){ using allocator_type = NotAlwaysEqualAllocator; using vector_type = tbb::concurrent_vector; diff --git a/test/tbb/test_continue_node.cpp b/test/tbb/test_continue_node.cpp index 54abda4817..ca177cfd31 100644 --- a/test/tbb/test_continue_node.cpp +++ b/test/tbb/test_continue_node.cpp @@ -305,11 +305,11 @@ void test_follows_and_precedes_api() { follows_and_precedes_testing::test_follows > - (messages_for_follows, pass_through); + (messages_for_follows, pass_through, node_priority_t(0)); follows_and_precedes_testing::test_precedes > - (messages_for_precedes, pass_through); + (messages_for_precedes, /* number_of_predecessors = */0, pass_through, node_priority_t(1)); } #endif // __TBB_PREVIEW_FLOW_GRAPH_NODE_SET diff --git a/test/tbb/test_dynamic_link.cpp b/test/tbb/test_dynamic_link.cpp index 95858b2e5e..2418e1fbe6 100644 --- a/test/tbb/test_dynamic_link.cpp +++ b/test/tbb/test_dynamic_link.cpp @@ -57,9 +57,7 @@ static const tbb::detail::r1::dynamic_link_descriptor LinkTable[] = { #include "src/tbb/dynamic_link.cpp" #include "common/utils_dynamic_libs.h" -//! Testing dynamic_link -//! \brief \ref error_guessing -TEST_CASE("Test dynamic_link") { +void test_dynamic_link(const char* lib_name) { #if __TBB_DYNAMIC_LOAD_ENABLED #if !_WIN32 // Check if the executable exports its symbols. @@ -71,7 +69,7 @@ TEST_CASE("Test dynamic_link") { // the dynamic_link call - let it be an empty string. // Generally speaking the test has sense only on Linux but on Windows it // checks the dynamic_link graceful behavior with incorrect library name. - if (tbb::detail::r1::dynamic_link("", LinkTable, sizeof(LinkTable) / sizeof(LinkTable[0]))) { + if (tbb::detail::r1::dynamic_link(lib_name, LinkTable, sizeof(LinkTable) / sizeof(LinkTable[0]))) { REQUIRE_MESSAGE((foo1_handler && foo2_handler), "The symbols are corrupted by dynamic_link"); REQUIRE_MESSAGE((foo1_handler() == FOO_IMPLEMENTATION && foo2_handler() == FOO_IMPLEMENTATION), "dynamic_link returned the successful code but symbol(s) are wrong"); @@ -80,3 +78,15 @@ TEST_CASE("Test dynamic_link") { } #endif } + +//! Testing dynamic_link with non-existing library +//! \brief \ref error_guessing +TEST_CASE("Test dynamic_link with non-existing library") { + test_dynamic_link("tbb_unrealNAME.so"); +} + +//! Testing dynamic_link +//! \brief \ref error_guessing +TEST_CASE("Test dynamic_link") { + test_dynamic_link(""); +} diff --git a/test/tbb/test_eh_algorithms.cpp b/test/tbb/test_eh_algorithms.cpp index e26c7267f5..7bd2782209 100644 --- a/test/tbb/test_eh_algorithms.cpp +++ b/test/tbb/test_eh_algorithms.cpp @@ -1094,6 +1094,7 @@ TEST_CASE("parallel_for_each exception handling test #5") { g_ExceptionInMaster = (j & 1) != 0; g_SolitaryException = (j & 2) != 0; + Test5_parallel_for_each >(); Test5_parallel_for_each >(); Test5_parallel_for_each >(); } diff --git a/test/tbb/test_flow_graph.cpp b/test/tbb/test_flow_graph.cpp index 56a03a14fd..9fad662cb0 100644 --- a/test/tbb/test_flow_graph.cpp +++ b/test/tbb/test_flow_graph.cpp @@ -339,3 +339,32 @@ TEST_CASE("Test parallel"){ TEST_CASE("Test graph_arena"){ test_graph_arena(); } + +//! Graph iterator +//! 
\brief \ref error_guessing +TEST_CASE("graph iterator") { + using namespace tbb::flow; + + graph g; + + auto past_end = g.end(); + ++past_end; + + continue_node n(g, [](const continue_msg &){return 1;}); + + size_t item_count = 0; + + for(auto it = g.cbegin(); it != g.cend(); it++) + ++item_count; + CHECK_MESSAGE((item_count == 1), "Should find 1 item"); + + item_count = 0; + auto jt(g.begin()); + for(; jt != g.end(); jt++) + ++item_count; + CHECK_MESSAGE((item_count == 1), "Should find 1 item"); + + graph g2; + continue_node n2(g, [](const continue_msg &){return 1;}); + CHECK_MESSAGE((g.begin() != g2.begin()), "Different graphs should have different iterators"); +} diff --git a/test/tbb/test_flow_graph_priorities.cpp b/test/tbb/test_flow_graph_priorities.cpp index 10e732b4c8..f3216d2461 100644 --- a/test/tbb/test_flow_graph_priorities.cpp +++ b/test/tbb/test_flow_graph_priorities.cpp @@ -782,6 +782,38 @@ void test(int num_threads) { } // namespace ManySuccessors +#if TBB_USE_EXCEPTIONS +namespace Exceptions { + void test() { + using namespace tbb::flow; + graph g; + std::srand(42); + continue_node c(g, [](continue_msg) { + return std::rand() % 10; + }, 2); + function_node f(g, unlimited, [](int v) { + if (v > 4) { + throw std::runtime_error("Exception::test"); + } + }, 1); + make_edge(c, f); + for (int i = 0; i < 10; ++i) { + try { + for (int j = 0; j < 50; ++j) { + c.try_put(continue_msg()); + } + g.wait_for_all(); + FAIL("Unreachable code. The exception is expected"); + } catch (std::runtime_error&) { + CHECK(g.is_cancelled()); + } catch (...) { + FAIL("Unexpected exception"); + } + } + } +} // namespace Exceptions +#endif + //! Test node prioritization //! \brief \ref requirement TEST_CASE("Priority nodes take precedence"){ @@ -825,3 +857,12 @@ TEST_CASE("Many successors") { ManySuccessors::test( p ); } } + +#if TBB_USE_EXCEPTIONS +//! Test for exceptions +//! 
\brief \ref error_guessing +TEST_CASE("Exceptions") { + Exceptions::test(); +} +#endif + diff --git a/test/tbb/test_flow_graph_whitebox.cpp b/test/tbb/test_flow_graph_whitebox.cpp index 8e5935c05d..667d0e1597 100644 --- a/test/tbb/test_flow_graph_whitebox.cpp +++ b/test/tbb/test_flow_graph_whitebox.cpp @@ -408,19 +408,52 @@ TestJoinNode() { INFO(" done\n"); } -void -TestLimiterNode() { +template +struct limiter_node_type { + using type = tbb::flow::limiter_node; + using dtype = DecrementerType; +}; + +template <> +struct limiter_node_type { + using type = tbb::flow::limiter_node; + using dtype = tbb::flow::continue_msg; +}; + +template +struct DecrementerHelper { + template + static void check(Decrementer&&) {} + static DType makeDType() { + return DType(1); + } +}; + +template <> +struct DecrementerHelper { + template + static void check(Decrementer&& d) { + CHECK_MESSAGE(d.my_predecessor_count == 0, "error in pred count"); + CHECK_MESSAGE(d.my_initial_predecessor_count == 0, "error in initial pred count"); + CHECK_MESSAGE(d.my_current_count == 0, "error in current count"); + } + static tbb::flow::continue_msg makeDType() { + return tbb::flow::continue_msg(); + } +}; + +template +void TestLimiterNode() { int out_int{}; tbb::flow::graph g; - tbb::flow::limiter_node ln(g,1); + using dtype = typename limiter_node_type::dtype; + typename limiter_node_type::type ln(g,1); INFO("Testing limiter_node: preds and succs"); - CHECK_MESSAGE( (ln.decrement.my_predecessor_count == 0), "error in pred count"); - CHECK_MESSAGE( (ln.decrement.my_initial_predecessor_count == 0), "error in initial pred count"); - CHECK_MESSAGE( (ln.decrement.my_current_count == 0), "error in current count"); + DecrementerHelper::check(ln.decrement); CHECK_MESSAGE( (ln.my_threshold == 1), "error in my_threshold"); tbb::flow::queue_node inq(g); tbb::flow::queue_node outq(g); - tbb::flow::broadcast_node bn(g); + tbb::flow::broadcast_node bn(g); tbb::flow::make_edge(inq,ln); tbb::flow::make_edge(ln,outq); @@ -437,7 +470,7 @@ TestLimiterNode() { g.wait_for_all(); CHECK_MESSAGE( (!outq.try_get(out_int)), "limiter_node incorrectly passed second input"); CHECK_MESSAGE( (!ln.my_predecessors.empty()), "input edge to limiter_node not reversed"); - bn.try_put(tbb::flow::continue_msg()); + bn.try_put(DecrementerHelper::makeDType()); g.wait_for_all(); CHECK_MESSAGE( (outq.try_get(out_int) && out_int == 2), "limiter_node didn't pass second value"); g.wait_for_all(); @@ -451,9 +484,7 @@ TestLimiterNode() { INFO(" rf_clear_edges"); // currently the limiter_node will not pass another message g.reset(tbb::flow::rf_clear_edges); - CHECK_MESSAGE( (ln.decrement.my_predecessor_count == 0), "error in pred count"); - CHECK_MESSAGE( (ln.decrement.my_initial_predecessor_count == 0), "error in initial pred count"); - CHECK_MESSAGE( (ln.decrement.my_current_count == 0), "error in current count"); + DecrementerHelper::check(ln.decrement); CHECK_MESSAGE( (ln.my_threshold == 1), "error in my_threshold"); CHECK_MESSAGE( (ln.my_predecessors.empty()), "preds not reset(rf_clear_edges)"); CHECK_MESSAGE( (ln.my_successors.empty()), "preds not reset(rf_clear_edges)"); @@ -467,7 +498,7 @@ TestLimiterNode() { g.wait_for_all(); CHECK_MESSAGE( (outq.try_get(out_int)),"missing output after reset(rf_clear_edges)"); CHECK_MESSAGE( (out_int == 4), "input incorrect (4)"); - bn.try_put(tbb::flow::continue_msg()); + bn.try_put(DecrementerHelper::makeDType()); g.wait_for_all(); CHECK_MESSAGE( (!outq.try_get(out_int)),"second output incorrectly passed (rf_clear_edges)"); 
INFO(" done\n"); @@ -747,7 +778,9 @@ TEST_CASE("Test join node"){ //! Test limiter_node //! \brief \ref error_guessing TEST_CASE("Test limiter node"){ - TestLimiterNode(); + TestLimiterNode(); + TestLimiterNode(); + TestLimiterNode(); } //! Test indexer_node @@ -769,3 +802,135 @@ TEST_CASE("Test scalar node"){ TestScalarNode >("overwrite_node"); TestScalarNode >("write_once_node"); } + +//! try_get in inactive graph +//! \brief \ref error_guessing +TEST_CASE("try_get in inactive graph"){ + tbb::flow::graph g; + + tbb::flow::input_node src(g, [&](tbb::flow_control& fc) -> bool { fc.stop(); return 0;}); + deactivate_graph(g); + + int tmp = -1; + CHECK_MESSAGE((src.try_get(tmp) == false), "try_get can not succeed"); + + src.activate(); + tmp = -1; + CHECK_MESSAGE((src.try_get(tmp) == false), "try_get can not succeed"); +} + +//! Test make_edge in inactive graph +//! \brief \ref error_guessing +TEST_CASE("Test make_edge in inactive graph"){ + tbb::flow::graph g; + + tbb::flow::continue_node c(g, [](const tbb::flow::continue_msg&){ return 1; }); + + tbb::flow::function_node f(g, tbb::flow::serial, serial_fn_body(serial_fn_state0)); + + c.try_put(tbb::flow::continue_msg()); + g.wait_for_all(); + + deactivate_graph(g); + + make_edge(c, f); +} + +//! Test make_edge from overwrite_node in inactive graph +//! \brief \ref error_guessing +TEST_CASE("Test make_edge from overwrite_node in inactive graph"){ + tbb::flow::graph g; + + tbb::flow::queue_node q(g); + + tbb::flow::overwrite_node on(g); + + on.try_put(1); + g.wait_for_all(); + + deactivate_graph(g); + + make_edge(on, q); + + int tmp = -1; + CHECK_MESSAGE((q.try_get(tmp) == false), "Message should not be passed on"); +} + +//! Test iterators directly +//! \brief \ref error_guessing +TEST_CASE("graph_iterator details"){ + tbb::flow::graph g; + const tbb::flow::graph cg; + + tbb::flow::graph::iterator b = g.begin(); + tbb::flow::graph::iterator b2 = g.begin(); + ++b2; + // Cast to a volatile pointer to workaround self assignment warnings from some compilers. + tbb::flow::graph::iterator* volatile b2_ptr = &b2; + b2 = *b2_ptr; + b = b2; + CHECK_MESSAGE((b == b2), "Assignment should make iterators equal"); +} + +//! const graph +//! \brief \ref error_guessing +TEST_CASE("const graph"){ + using namespace tbb::flow; + + const graph g; + CHECK_MESSAGE((g.cbegin() == g.cend()), "Starting graph is empty"); + CHECK_MESSAGE((g.begin() == g.end()), "Starting graph is empty"); + + graph g2; + CHECK_MESSAGE((g2.begin() == g2.end()), "Starting graph is empty"); +} + +//! Send message to continue_node while graph is inactive +//! \brief \ref error_guessing +TEST_CASE("Send message to continue_node while graph is inactive") { + using namespace tbb::flow; + + graph g; + + continue_node c(g, [](const continue_msg&){ return 1; }); + buffer_node b(g); + + make_edge(c, b); + + deactivate_graph(g); + + c.try_put(continue_msg()); + g.wait_for_all(); + + int tmp = -1; + CHECK_MESSAGE((b.try_get(tmp) == false), "Message should not arrive"); + CHECK_MESSAGE((tmp == -1), "Value should not be altered"); +} + + +//! Bypass of a successor's message in a node with lightweight policy +//! 
\brief \ref error_guessing +TEST_CASE("Bypass of a successor's message in a node with lightweight policy") { + using namespace tbb::flow; + + graph g; + + auto body = [](const int&v)->int { return v * 2; }; + function_node f1(g, unlimited, body); + + auto body2 = [](const int&v)->int {return v / 2;}; + function_node f2(g, unlimited, body2); + + buffer_node b(g); + + make_edge(f1, f2); + make_edge(f2, b); + + f1.try_put(1); + g.wait_for_all(); + + int tmp = -1; + CHECK_MESSAGE((b.try_get(tmp) == true), "Functional nodes can work in succession"); + CHECK_MESSAGE((tmp == 1), "Value should not be altered"); +} + diff --git a/test/tbb/test_function_node.cpp b/test/tbb/test_function_node.cpp index 5115c12430..40e638b9ee 100644 --- a/test/tbb/test_function_node.cpp +++ b/test/tbb/test_function_node.cpp @@ -464,7 +464,7 @@ void test_follows_and_precedes_api() { (messages_for_follows, tbb::flow::unlimited, pass_msg); follows_and_precedes_testing::test_precedes > - (messages_for_precedes, tbb::flow::unlimited, pass_msg); + (messages_for_precedes, tbb::flow::unlimited, pass_msg, tbb::flow::node_priority_t(1)); } #endif @@ -544,3 +544,14 @@ TEST_CASE("Deduction guides test"){ test_deduction_guides(); } #endif + +//! try_release and try_consume test +//! \brief \ref error_guessing +TEST_CASE("try_release try_consume"){ + tbb::flow::graph g; + + tbb::flow::function_node fn(g, tbb::flow::unlimited, [](const int&v){return v;}); + + CHECK_MESSAGE((fn.try_release()==false), "try_release should initially return false on a node"); + CHECK_MESSAGE((fn.try_consume()==false), "try_consume should initially return false on a node"); +} diff --git a/test/tbb/test_indexer_node.cpp b/test/tbb/test_indexer_node.cpp index 26d6a72989..9ae121bc64 100644 --- a/test/tbb/test_indexer_node.cpp +++ b/test/tbb/test_indexer_node.cpp @@ -14,6 +14,7 @@ limitations under the License. 
*/ +#define MAX_TUPLE_TEST_SIZE 10 #include "common/config.h" #include "tbb/flow_graph.h" @@ -253,16 +254,18 @@ class parallel_test { } for(int nInputs = 1; nInputs <= MaxNInputs; ++nInputs) { tbb::flow::graph g; - IType* my_indexer = new IType(g); //makeIndexer::create(); + IType* my_indexer_ptr = new IType(g); //makeIndexer::create(); + IType my_indexer = *my_indexer_ptr; tbb::flow::queue_node outq1(g); tbb::flow::queue_node outq2(g); - tbb::flow::make_edge(*my_indexer, outq1); - tbb::flow::make_edge(*my_indexer, outq2); + tbb::flow::make_edge(my_indexer, outq1); + tbb::flow::make_edge(my_indexer, outq2); - input_node_helper::add_input_nodes((*my_indexer), g, nInputs); + input_node_helper::add_input_nodes(my_indexer, g, nInputs); g.wait_for_all(); + makeIndexer::destroy(my_indexer_ptr); reset_outputCheck(SIZE, Count); for(int i=0; i < Count*SIZE; ++i) { @@ -282,10 +285,9 @@ class parallel_test { CHECK_MESSAGE(!outq1.try_get(v), ""); CHECK_MESSAGE(!outq2.try_get(v), ""); - input_node_helper::remove_input_nodes((*my_indexer), nInputs); - tbb::flow::remove_edge(*my_indexer, outq1); - tbb::flow::remove_edge(*my_indexer, outq2); - makeIndexer::destroy(my_indexer); + input_node_helper::remove_input_nodes(my_indexer, nInputs); + tbb::flow::remove_edge(my_indexer, outq1); + tbb::flow::remove_edge(my_indexer, outq2); } } }; @@ -551,7 +553,7 @@ class generate_test { #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET template -void check_edge(tbb::flow::graph& g, +bool check_edge(tbb::flow::graph& g, tbb::flow::broadcast_node& start, tbb::flow::buffer_node& buf, input_t input_value) { @@ -563,27 +565,36 @@ void check_edge(tbb::flow::graph& g, CHECK_MESSAGE( ((is_get_succeeded)), "There is no item in the buffer"); CHECK_MESSAGE( ((tbb::flow::cast_to(msg) == input_value)), "Wrong item value"); + return true; } -void test_follows() { +template +void sink(T...) 
{} + +template +void check_edge(tbb::flow::graph& g, BN& bn, tbb::flow::buffer_node& buf, Type, tbb::detail::index_sequence) { + sink(check_edge(g, std::get(bn), buf, typename std::tuple_element::type(Seq))...); +} + +template +void test_follows_impl(std::tuple t, tbb::detail::index_sequence seq) { using namespace tbb::flow; - using indexer_output_t = indexer_node::output_type; + using indexer_output_t = typename indexer_node::output_type; graph g; - broadcast_node start(g); - - broadcast_node start1(g); - broadcast_node start2(g); - broadcast_node start3(g); + auto bn = std::make_tuple(broadcast_node(g)...); - indexer_node my_indexer(follows(start1, start2, start3)); + indexer_node my_indexer(follows(std::get(bn)...)); buffer_node buf(g); make_edge(my_indexer, buf); - check_edge(g, start1, buf, 1); - check_edge(g, start2, buf, 2.2f); - check_edge(g, start3, buf, 3.3); + check_edge(g, bn, buf, t, seq); +} + +template +void test_follows() { + test_follows_impl(std::tuple(), tbb::detail::make_index_sequence()); } void test_precedes() { @@ -613,7 +624,16 @@ void test_precedes() { } void test_follows_and_precedes_api() { - test_follows(); + test_follows(); + test_follows(); + test_follows(); + test_follows(); + test_follows(); + test_follows(); + test_follows(); + test_follows(); + test_follows(); + test_follows(); test_precedes(); } #endif // __TBB_PREVIEW_FLOW_GRAPH_NODE_SET @@ -646,7 +666,7 @@ TEST_CASE("Serial and parallel test") { for (int p = 0; p < 2; ++p) { generate_test::do_test(); #if MAX_TUPLE_TEST_SIZE >= 4 - generate_test::do_test(); + generate_test::do_test(); #endif #if MAX_TUPLE_TEST_SIZE >= 6 generate_test::do_test(); diff --git a/test/tbb/test_input_node.cpp b/test/tbb/test_input_node.cpp index d1867feef5..9294398d08 100644 --- a/test/tbb/test_input_node.cpp +++ b/test/tbb/test_input_node.cpp @@ -63,8 +63,6 @@ class test_push_receiver : public tbb::flow::receiver, utils::NoAssign { tbb::flow::graph& graph_reference() const override { return my_graph; } - - void reset_receiver(tbb::flow::reset_flags /*f*/) override {} }; template< typename T > @@ -367,3 +365,12 @@ TEST_CASE("Deduction guides"){ } #endif +//! Test try_get before activation +//! 
\brief \ref error_guessing +TEST_CASE("try_get before activation"){ + tbb::flow::graph g; + tbb::flow::input_node in(g, [&](tbb::flow_control& fc) -> bool { fc.stop(); return 0;}); + + int tmp = -1; + CHECK_MESSAGE((in.try_get(tmp) == false), "try_get before activation should not succeed"); +} diff --git a/test/tbb/test_intrusive_list.cpp b/test/tbb/test_intrusive_list.cpp index 3412ad69fa..fddbc2060f 100644 --- a/test/tbb/test_intrusive_list.cpp +++ b/test/tbb/test_intrusive_list.cpp @@ -81,6 +81,10 @@ void check_list_nodes( List& il, int value_step ) { int i; Iterator it = il.begin(); + + Iterator it_default; + REQUIRE_MESSAGE(it_default != it, "Incorrect default constructed intrusive_list::iterator"); + for ( i = value_step - 1; it != il.end(); ++it, i += value_step ) { REQUIRE_MESSAGE(it->Data() == i, "Unexpected node value while iterating forward"); REQUIRE_MESSAGE(it->m_Canary == NoliMeTangere, "Memory corruption"); diff --git a/test/tbb/test_join_node.cpp b/test/tbb/test_join_node.cpp index 585cc78508..945c851960 100644 --- a/test/tbb/test_join_node.cpp +++ b/test/tbb/test_join_node.cpp @@ -134,7 +134,15 @@ void test_follows_and_precedes_api() { >(messages_for_follows); follows_and_precedes_testing::test_follows , tbb::flow::buffer_node>(messages_for_follows); - // TODO: add tests for key_matching and message based key matching + auto b = [](msg_t) { return msg_t(); }; + class hash_compare { + public: + std::size_t hash(msg_t) const { return 0; } + bool equal(msg_t, msg_t) const { return true; } + }; + follows_and_precedes_testing::test_follows + >, tbb::flow::buffer_node> + (messages_for_follows, b, b, b); follows_and_precedes_testing::test_precedes >(messages_for_precedes); @@ -142,6 +150,9 @@ void test_follows_and_precedes_api() { >(messages_for_precedes); follows_and_precedes_testing::test_precedes >(messages_for_precedes); + follows_and_precedes_testing::test_precedes + >> + (messages_for_precedes, b, b, b); } #endif @@ -175,6 +186,101 @@ void test_deduction_guides() { #endif +namespace multiple_predecessors { + +using namespace tbb::flow; + +using join_node_t = join_node, reserving>; +using queue_node_t = queue_node>; + +void twist_join_connections( + buffer_node& bn1, buffer_node& bn2, buffer_node& bn3, + join_node_t& jn) +{ + // order, in which edges are created/destroyed, is important + make_edge(bn1, input_port<0>(jn)); + make_edge(bn2, input_port<0>(jn)); + make_edge(bn3, input_port<0>(jn)); + + remove_edge(bn3, input_port<0>(jn)); + make_edge (bn3, input_port<2>(jn)); + + remove_edge(bn2, input_port<0>(jn)); + make_edge (bn2, input_port<1>(jn)); +} + +std::unique_ptr connect_join_via_make_edge( + graph& g, buffer_node& bn1, buffer_node& bn2, + buffer_node& bn3, queue_node_t& qn) +{ + std::unique_ptr jn( new join_node_t(g) ); + twist_join_connections( bn1, bn2, bn3, *jn ); + make_edge(*jn, qn); + return jn; +} + +#if TBB_PREVIEW_FLOW_GRAPH_FEATURES +std::unique_ptr connect_join_via_follows( + graph&, buffer_node& bn1, buffer_node& bn2, + buffer_node& bn3, queue_node_t& qn) +{ + auto bn_set = make_node_set(bn1, bn2, bn3); + std::unique_ptr jn( new join_node_t(follows(bn_set)) ); + make_edge(*jn, qn); + return jn; +} + +std::unique_ptr connect_join_via_precedes( + graph&, buffer_node& bn1, buffer_node& bn2, + buffer_node& bn3, queue_node_t& qn) +{ + auto qn_set = make_node_set(qn); + auto qn_copy_set = qn_set; + std::unique_ptr jn( new join_node_t(precedes(qn_copy_set)) ); + twist_join_connections( bn1, bn2, bn3, *jn ); + return jn; +} +#endif // 
TBB_PREVIEW_FLOW_GRAPH_FEATURES + +void run_and_check( + graph& g, buffer_node& bn1, buffer_node& bn2, + buffer_node& bn3, queue_node_t& qn, bool expected) +{ + std::tuple msg; + + bn1.try_put(continue_msg()); + bn2.try_put(continue_msg()); + bn3.try_put(continue_msg()); + g.wait_for_all(); + + CHECK_MESSAGE( + (qn.try_get(msg) == expected), + "Unexpected message absence/existence at the end of the graph." + ); +} + +template +void test(ConnectJoinNodeFunc&& connect_join_node) { + graph g; + buffer_node bn1(g); + buffer_node bn2(g); + buffer_node bn3(g); + queue_node_t qn(g); + + auto jn = connect_join_node(g, bn1, bn2, bn3, qn); + + run_and_check(g, bn1, bn2, bn3, qn, /*expected=*/true); + + remove_edge(bn3, input_port<2>(*jn)); + remove_edge(bn2, input_port<1>(*jn)); + remove_edge(bn1, input_port<0>(*jn)); + remove_edge(*jn, qn); + + run_and_check(g, bn1, bn2, bn3, qn, /*expected=*/false); +} +} // namespace multiple_predecessors + + #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET //! Test follows and precedes API //! \brief \ref error_guessing @@ -210,3 +316,15 @@ TEST_CASE("Main test"){ TEST_CASE("Recirculation test"){ generate_recirc_test >::do_test(); } + +//! Test maintaining correct count of ports without input +//! \brief \ref error_guessing +TEST_CASE("Test removal of the predecessor while having none") { + using namespace multiple_predecessors; + + test(connect_join_via_make_edge); +#if TBB_PREVIEW_FLOW_GRAPH_FEATURES + test(connect_join_via_follows); + test(connect_join_via_precedes); +#endif +} diff --git a/test/tbb/test_join_node.h b/test/tbb/test_join_node.h index 43bba1c3cf..35a2778ec0 100644 --- a/test/tbb/test_join_node.h +++ b/test/tbb/test_join_node.h @@ -449,7 +449,7 @@ class name_of > { }; // The additional policy to differ message based key matching from usual key matching. -// It only has sense for the test because join_node is created with the key_matching policy for the both cases. +// It only makes sense for the test because join_node is created with the key_matching policy for the both cases. template ::type > > struct message_based_key_matching {}; @@ -1308,7 +1308,7 @@ class serial_queue_helper<1, JType> { }; // -// Single reservable predecessor at each port, single accepting successor +// Single reservable predecessor at each port, single accepting and rejecting successor // * put to buffer before port0, then put to buffer before port1, ... // * fill buffer before port0 then fill buffer before port1, ... @@ -1320,7 +1320,7 @@ void test_one_serial(JType &my_join, tbb::flow::graph &g) { std::vector flags; serial_queue_helper::add_queue_nodes(g, my_join); typedef TType q3_input_type; - tbb::flow::queue_node< q3_input_type > q3(g); + tbb::flow::queue_node< q3_input_type > q3(g); tbb::flow::make_edge(my_join, q3); @@ -1354,18 +1354,23 @@ void test_one_serial(JType &my_join, tbb::flow::graph &g) { } } + tbb::flow::remove_edge(my_join, q3); + tbb::flow::limiter_node limiter(g, Count / 2); + tbb::flow::make_edge(my_join, limiter); + tbb::flow::make_edge(limiter, q3); + // fill each queue completely before filling the next. 
serial_queue_helper::fill_one_queue(Count); g.wait_for_all(); - for(int i = 0; i < Count; ++i) { + for(int i = 0; i < Count / 2; ++i) { q3_input_type v; g.wait_for_all(); CHECK_MESSAGE( (q3.try_get(v)), "Error in try_get()"); if(is_key_matching) { int j = int(std::get<0>(v))/2; serial_queue_helper::check_queue_value(j, v); - flags[i] = true; + flags[j] = true; } else { serial_queue_helper::check_queue_value(i, v); @@ -1373,13 +1378,10 @@ void test_one_serial(JType &my_join, tbb::flow::graph &g) { } if(is_key_matching) { - for(int i = 0; i < Count; ++i) { - CHECK_MESSAGE(flags[i], ""); - } + CHECK(std::count(flags.begin(), flags.end(), true) == Count / 2); } serial_queue_helper::remove_queue_nodes(my_join); - } template diff --git a/test/tbb/test_join_node_key_matching.cpp b/test/tbb/test_join_node_key_matching.cpp index b6eac84048..33ffaa1c95 100644 --- a/test/tbb/test_join_node_key_matching.cpp +++ b/test/tbb/test_join_node_key_matching.cpp @@ -14,6 +14,7 @@ limitations under the License. */ +#define MAX_TUPLE_TEST_SIZE 10 #include "common/config.h" #include "test_join_node.h" @@ -52,26 +53,32 @@ void test_deduction_guides() { } #endif +template +using make_tuple = decltype(std::tuple_cat(T1(), std::tuple())); +using T1 = std::tuple>; +using T2 = make_tuple>; +using T3 = make_tuple>; +using T4 = make_tuple>; +using T5 = make_tuple>; +using T6 = make_tuple>; +using T7 = make_tuple>; +using T8 = make_tuple>; +using T9 = make_tuple>; +using T10 = make_tuple>; + //! Test serial key matching on special input types //! \brief \ref error_guessing -TEST_CASE("Serial test on tuples"){ +TEST_CASE("Serial test on tuples") { INFO("key_matching\n"); generate_test, MyKeySecond >, tbb::flow::key_matching >::do_test(); generate_test, MyKeySecond >, tbb::flow::key_matching >::do_test(); -#if MAX_TUPLE_TEST_SIZE >= 3 generate_test, MyKeySecond, MyKeyWithBrokenMessageKey >, tbb::flow::key_matching >::do_test(); -#endif -#if MAX_TUPLE_TEST_SIZE >= 7 - generate_test, - MyKeyWithBrokenMessageKey, - MyKeyFirst, - MyKeySecond, - MyKeyWithBrokenMessageKey, - MyKeySecond, - MyKeySecond - >, tbb::flow::key_matching >::do_test(); -#endif +} + +//! Serial test with different tuple sizes +//! \brief \ref error_guessing +TEST_CASE_TEMPLATE("Serial N tests on tuples", T, T2, T3, T4, T5, T6, T7, T8, T9, T10) { + generate_test>::do_test(); } #if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT @@ -88,19 +95,10 @@ TEST_CASE("Parallel test on tuples"){ generate_test, MyKeySecond >, tbb::flow::key_matching >::do_test(); generate_test, MyKeySecond >, tbb::flow::key_matching >::do_test(); generate_test, MyKeySecond >, tbb::flow::key_matching >::do_test(); +} -#if MAX_TUPLE_TEST_SIZE >= 10 - generate_test, - MyKeySecond, - MyKeyFirst, - MyKeyWithBrokenMessageKey, - MyKeyWithBrokenMessageKey, - MyKeySecond, - MyKeySecond, - MyKeyFirst, - MyKeySecond, - MyKeyWithBrokenMessageKey - >, tbb::flow::key_matching >::do_test(); -#endif +//! Parallel test with different tuple sizes +//! 
\brief \ref error_guessing +TEST_CASE_TEMPLATE("Parallel N tests on tuples", T, T2, T3, T4, T5, T6, T7, T8, T9, T10) { + generate_test>::do_test(); } diff --git a/test/tbb/test_join_node_msg_key_matching.cpp b/test/tbb/test_join_node_msg_key_matching.cpp index 533bb7fdf0..c50ba30cbe 100644 --- a/test/tbb/test_join_node_msg_key_matching.cpp +++ b/test/tbb/test_join_node_msg_key_matching.cpp @@ -16,6 +16,7 @@ // Message based key matching is a preview feature #define TBB_PREVIEW_FLOW_GRAPH_FEATURES 1 +#define MAX_TUPLE_TEST_SIZE 10 #include "common/config.h" @@ -24,6 +25,25 @@ //! \file test_join_node_msg_key_matching.cpp //! \brief Test for [preview] functionality +#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET +#include +#include +void test_follows_and_precedes_api() { + using msg_t = MyMessageKeyWithoutKey; + using JoinOutputType = std::tuple; + + std::array messages_for_follows = { {msg_t(), msg_t(), msg_t()} }; + std::vector messages_for_precedes = { msg_t(), msg_t(), msg_t() }; + + follows_and_precedes_testing::test_follows + >, tbb::flow::buffer_node> + (messages_for_follows); + follows_and_precedes_testing::test_precedes + >> + (messages_for_precedes); +} +#endif + #if __TBB_CPP17_DEDUCTION_GUIDES_PRESENT struct message_key { int my_key; @@ -44,54 +64,41 @@ void test_deduction_guides() { broadcast_node bm1(g), bm2(g); broadcast_node bm3(g); join_node > j0(g); - -#if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET - join_node j1(follows(bm1, bm2), key_matching()); - static_assert(std::is_same_v>>); - - join_node j2(precedes(bm3), key_matching()); - static_assert(std::is_same_v>>); -#endif - join_node j3(j0); static_assert(std::is_same_v>>); } #endif -//! Serial test with different tuple sizes +//! Serial test with matching policies //! \brief \ref error_guessing -TEST_CASE("Serial test"){ +TEST_CASE("Serial test") { generate_test, MyMessageKeyWithoutKey >, message_based_key_matching >::do_test(); generate_test, MyMessageKeyWithBrokenKey >, message_based_key_matching >::do_test(); -#if MAX_TUPLE_TEST_SIZE >= 3 - generate_test, MyMessageKeyWithoutKeyMethod, MyMessageKeyWithBrokenKey >, message_based_key_matching >::do_test(); -#endif -#if MAX_TUPLE_TEST_SIZE >= 7 - generate_test, - MyMessageKeyWithoutKeyMethod, - MyMessageKeyWithBrokenKey, - MyMessageKeyWithoutKey, - MyMessageKeyWithoutKeyMethod, - MyMessageKeyWithBrokenKey, - MyMessageKeyWithoutKey - >, message_based_key_matching >::do_test(); -#endif +} -#if MAX_TUPLE_TEST_SIZE >= 10 - generate_test, - MyMessageKeyWithBrokenKey, - MyMessageKeyWithoutKey, - MyMessageKeyWithoutKeyMethod, - MyMessageKeyWithBrokenKey, - MyMessageKeyWithoutKeyMethod, - MyMessageKeyWithoutKeyMethod, - MyMessageKeyWithBrokenKey, - MyMessageKeyWithoutKeyMethod, - MyMessageKeyWithBrokenKey - >, message_based_key_matching >::do_test(); -#endif +template +using make_tuple = decltype(std::tuple_cat(T1(), std::tuple())); +using T1 = std::tuple>; +using T2 = make_tuple>; +using T3 = make_tuple < T2, MyMessageKeyWithoutKey>; +using T4 = make_tuple < T3, MyMessageKeyWithoutKeyMethod>; +using T5 = make_tuple < T4, MyMessageKeyWithBrokenKey>; +using T6 = make_tuple < T5, MyMessageKeyWithoutKeyMethod>; +using T7 = make_tuple < T6, MyMessageKeyWithoutKeyMethod>; +using T8 = make_tuple < T7, MyMessageKeyWithBrokenKey>; +using T9 = make_tuple < T8, MyMessageKeyWithoutKeyMethod>; +using T10 = make_tuple < T9, MyMessageKeyWithBrokenKey>; + +//! Serial test with different tuple sizes +//! 
\brief \ref error_guessing +TEST_CASE_TEMPLATE("Serial N tests", T, T2, T3, T4, T5, T6, T7, T8, T9, T10) { + generate_test >::do_test(); +} + +//! Parallel test with different tuple sizes +//! \brief \ref error_guessing +TEST_CASE_TEMPLATE("Parallel N tests", T, T2, T3, T4, T5, T6, T7, T8, T9, T10) { + generate_test >::do_test(); } //! Parallel test with special key types diff --git a/test/tbb/test_limiter_node.cpp b/test/tbb/test_limiter_node.cpp index 6d82adc0c5..825dd23202 100644 --- a/test/tbb/test_limiter_node.cpp +++ b/test/tbb/test_limiter_node.cpp @@ -51,8 +51,6 @@ struct serial_receiver : public tbb::flow::receiver, utils::NoAssign { tbb::flow::graph& graph_reference() const override { return my_graph; } - - void reset_receiver(tbb::flow::reset_flags /*f*/) override {next_value = T(0);} }; template< typename T > @@ -71,8 +69,6 @@ struct parallel_receiver : public tbb::flow::receiver, utils::NoAssign { tbb::flow::graph& graph_reference() const override { return my_graph; } - - void reset_receiver(tbb::flow::reset_flags /*f*/) override {my_count = 0;} }; template< typename T > @@ -414,6 +410,36 @@ void test_decrementer() { g.wait_for_all(); } +void test_try_put_without_successors() { + tbb::flow::graph g; + std::size_t try_put_num{3}; + tbb::flow::buffer_node bn(g); + tbb::flow::limiter_node ln(g, try_put_num); + tbb::flow::make_edge(bn, ln); + std::size_t i = 1; + for (; i <= try_put_num; i++) + bn.try_put(i); + + std::atomic counter{0}; + tbb::flow::function_node fn(g, tbb::flow::unlimited, + [&](int input) { + counter += input; + return int{}; + } + ); + tbb::flow::make_edge(ln, fn); + g.wait_for_all(); + CHECK((counter == i * try_put_num / 2)); + + // Check the lost message + tbb::flow::remove_edge(bn, ln); + ln.decrement.try_put(tbb::flow::continue_msg()); + bn.try_put(try_put_num + 1); + g.wait_for_all(); + CHECK((counter == i * try_put_num / 2)); + +} + #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET #include #include @@ -492,6 +518,12 @@ TEST_CASE("Decrementer") { test_decrementer(); } +//! Test try_put() without successor +//! \brief \ref error_guessing +TEST_CASE("Test try_put() without successors") { + test_try_put_without_successors(); +} + #if __TBB_PREVIEW_FLOW_GRAPH_NODE_SET //! Test follows and precedes API //! \brief \ref error_guessing diff --git a/test/tbb/test_multifunction_node.cpp b/test/tbb/test_multifunction_node.cpp index 5cfbd1c837..d8c24e6b67 100644 --- a/test/tbb/test_multifunction_node.cpp +++ b/test/tbb/test_multifunction_node.cpp @@ -552,5 +552,34 @@ TEST_CASE("Lightweight testing"){ TEST_CASE("Test follows-precedes API"){ test_follows_and_precedes_api(); } +//! Test priority constructor with follows and precedes API +//! 
\brief \ref error_guessing +TEST_CASE("Test priority with follows and precedes"){ + using namespace tbb::flow; + + using multinode = multifunction_node<int, std::tuple<int, int>>; + + graph g; + + buffer_node<int> b1(g); + buffer_node<int> b2(g); + + multinode node(precedes(b1, b2), unlimited, [](const int& i, multinode::output_ports_type& op) -> void { + if (i % 2) + std::get<0>(op).try_put(i); + else + std::get<1>(op).try_put(i); + } + , node_priority_t(0)); + + node.try_put(0); + node.try_put(1); + g.wait_for_all(); + + int storage; + CHECK_MESSAGE((b1.try_get(storage) && !b1.try_get(storage) && b2.try_get(storage) && !b2.try_get(storage)), + "Not exact edge quantity was made"); +} + #endif diff --git a/test/tbb/test_openmp.cpp b/test/tbb/test_openmp.cpp new file mode 100644 index 0000000000..8a4798f29f --- /dev/null +++ b/test/tbb/test_openmp.cpp @@ -0,0 +1,161 @@ +/* + Copyright (c) 2005-2020 Intel Corporation + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "common/test.h" +#include "common/utils.h" +#include "common/utils_env.h" +#include "tbb/global_control.h" +#include "tbb/blocked_range.h" +#include "tbb/parallel_for.h" +#include "tbb/parallel_reduce.h" + +// Test mixing OpenMP and TBB +#include <omp.h> + +//! \file test_openmp.cpp +//! \brief Test for [internal] functionality + +using data_type = short; + +void SerialConvolve( data_type c[], const data_type a[], int m, const data_type b[], int n ) { + for (int i = 0; i < m + n - 1; ++i) { + int start = i < n ? 0 : i - n + 1; + int finish = i < m ? i + 1 : m; + data_type sum = 0; + for (int j = start; j < finish; ++j) + sum += a[j] * b[i - j]; + c[i] = sum; + } +} + +#if _MSC_VER && !defined(__INTEL_COMPILER) + // Suppress overzealous warning about short+=short + #pragma warning( push ) + #pragma warning( disable: 4244 ) +#endif + +class InnerBody: utils::NoAssign { + const data_type* my_a; + const data_type* my_b; + const int i; +public: + data_type sum; + InnerBody( data_type /*c*/[], const data_type a[], const data_type b[], int ii ) : + my_a(a), my_b(b), i(ii), sum(0) + {} + InnerBody( InnerBody& x, tbb::split ) : + my_a(x.my_a), my_b(x.my_b), i(x.i), sum(0) + { + } + void join( InnerBody& x ) { sum += x.sum; } + void operator()( const tbb::blocked_range<int>& range ) { + for (int j = range.begin(); j != range.end(); ++j) + sum += my_a[j] * my_b[i - j]; + } +}; + +#if _MSC_VER && !defined(__INTEL_COMPILER) + #pragma warning( pop ) +#endif + +//! Test OpenMP loop around TBB loop +void OpenMP_TBB_Convolve( data_type c[], const data_type a[], int m, const data_type b[], int n ) { +#pragma omp parallel + { +#pragma omp for + for (int i = 0; i < m + n - 1; ++i) { + int start = i < n ? 0 : i - n + 1; + int finish = i < m ?
+            InnerBody body(c, a, b, i);
+            tbb::parallel_reduce(tbb::blocked_range<int>(start, finish, 10), body);
+            c[i] = body.sum;
+        }
+    }
+}
+
+class OuterBody: utils::NoAssign {
+    const data_type* my_a;
+    const data_type* my_b;
+    data_type* my_c;
+    const int m;
+    const int n;
+public:
+    OuterBody( data_type c[], const data_type a[], int m_, const data_type b[], int n_ ) :
+        my_a(a), my_b(b), my_c(c), m(m_), n(n_)
+    {}
+    void operator()( const tbb::blocked_range<int>& range ) const {
+        for (int i = range.begin(); i != range.end(); ++i) {
+            int start = i < n ? 0 : i - n + 1;
+            int finish = i < m ? i + 1 : m;
+            data_type sum = 0;
+#pragma omp parallel for reduction(+:sum)
+            for (int j = start; j < finish; ++j)
+                sum += my_a[j] * my_b[i - j];
+            my_c[i] = sum;
+        }
+    }
+};
+
+//! Test TBB loop around OpenMP loop
+void TBB_OpenMP_Convolve( data_type c[], const data_type a[], int m, const data_type b[], int n ) {
+    tbb::parallel_for(tbb::blocked_range<int>(0, m + n - 1, 10), OuterBody(c, a, m, b, n));
+}
+
+#if __INTEL_COMPILER
+void TestNumThreads() {
+    utils::SetEnv("KMP_AFFINITY", "compact");
+    // Make an OpenMP call before initializing TBB
+    int omp_nthreads = omp_get_max_threads();
+    #pragma omp parallel
+    {}
+    int tbb_nthreads = tbb::this_task_arena::max_concurrency();
+    // For the purpose of testing, assume that OpenMP and TBB should utilize the same # of threads.
+    // If it's not true on some platforms, the test will need to be adjusted.
+    REQUIRE_MESSAGE(tbb_nthreads == omp_nthreads, "Initialization of TBB is possibly affected by OpenMP");
+}
+#endif // __INTEL_COMPILER
+
+const int M = 17 * 17;
+const int N = 13 * 13;
+data_type A[M], B[N];
+data_type expected[M+N], actual[M+N];
+
+template <typename Func>
+void RunTest( Func F, int m, int n, std::size_t p ) {
+    tbb::global_control limit(tbb::global_control::max_allowed_parallelism, p);
+    memset(actual, -1, (m + n) * sizeof(data_type));
+    F(actual, A, m, B, n);
+    CHECK(memcmp(actual, expected, (m + n - 1) * sizeof(data_type)) == 0);
+}
+
+//! \brief \ref error_guessing
+TEST_CASE("Testing oneTBB with OpenMP") {
+#if __INTEL_COMPILER
+    TestNumThreads(); // Testing initialization-related behavior; must be the first
+#endif // __INTEL_COMPILER
+
+    for (std::size_t p = utils::MinThread; p <= utils::MaxThread; ++p) {
+        for (std::size_t m = 1; m <= M; m *= 17) {
+            for (std::size_t n = 1; n <= N; n *= 13) {
+                for (std::size_t i = 0; i < m; ++i) A[i] = data_type(1 + i / 5);
+                for (std::size_t i = 0; i < n; ++i) B[i] = data_type(1 + i / 7);
+                SerialConvolve( expected, A, m, B, n );
+                RunTest( OpenMP_TBB_Convolve, m, n, p );
+                RunTest( TBB_OpenMP_Convolve, m, n, p );
+            }
+        }
+    }
+}
diff --git a/test/tbb/test_overwrite_node.cpp b/test/tbb/test_overwrite_node.cpp
index e1fa3eb565..3d1981a9b2 100644
--- a/test/tbb/test_overwrite_node.cpp
+++ b/test/tbb/test_overwrite_node.cpp
@@ -225,3 +225,12 @@ TEST_CASE("Deduction guides"){
 }
 #endif

+//! Test try_release
+//! \brief \ref error_guessing
+TEST_CASE("try_release"){
+    tbb::flow::graph g;
+
+    tbb::flow::overwrite_node<int> on(g);
+
+    CHECK_MESSAGE((on.try_release() == true), "try_release should return true");
+}
diff --git a/test/tbb/test_parallel_pipeline.cpp b/test/tbb/test_parallel_pipeline.cpp
index 25e01cd668..4ba658fb42 100644
--- a/test/tbb/test_parallel_pipeline.cpp
+++ b/test/tbb/test_parallel_pipeline.cpp
@@ -457,7 +457,7 @@ void run_filter_set(
     resetCounters();
     tbb::parallel_pipeline( n_tokens, copy123, context... );
     checkCounters(my_t);
-    tbb::filter move123( filter123 );
+    tbb::filter move123( std::move(filter123) );
     resetCounters();
     tbb::parallel_pipeline( n_tokens, move123, context... );
     checkCounters(my_t);
diff --git a/test/tbb/test_queue_node.cpp b/test/tbb/test_queue_node.cpp
index 68a86b6155..8b48729642 100644
--- a/test/tbb/test_queue_node.cpp
+++ b/test/tbb/test_queue_node.cpp
@@ -530,3 +530,28 @@ TEST_CASE("Deduction guides"){
 }
 #endif

+//! Test operations on a reserved queue_node
+//! \brief \ref error_guessing
+TEST_CASE("queue_node with reservation"){
+    tbb::flow::graph g;
+
+    tbb::flow::queue_node<int> q(g);
+
+    bool res = q.try_put(42);
+    CHECK_MESSAGE( res, "queue_node must accept input." );
+
+    int val = 1;
+    res = q.try_reserve(val);
+    CHECK_MESSAGE( res, "queue_node must reserve as it has an item." );
+    CHECK_MESSAGE( (val == 42), "queue_node must reserve the passed item." );
+
+    int out_arg = -1;
+    CHECK_MESSAGE((q.try_reserve(out_arg) == false), "Reserving a reserved node should fail.");
+    CHECK_MESSAGE((out_arg == -1), "Reserving a reserved node should not update its argument.");
+
+    out_arg = -1;
+    CHECK_MESSAGE((q.try_get(out_arg) == false), "Getting from a reserved node should fail.");
+    CHECK_MESSAGE((out_arg == -1), "Getting from a reserved node should not update its argument.");
+    g.wait_for_all();
+}
diff --git a/test/tbb/test_task.cpp b/test/tbb/test_task.cpp
index 57ee5e535f..6c4f2dc443 100644
--- a/test/tbb/test_task.cpp
+++ b/test/tbb/test_task.cpp
@@ -18,12 +18,14 @@
 #include "common/utils.h"
 #include "common/spin_barrier.h"
 #include "common/utils_concurrency_limit.h"
+#include "common/cpu_usertime.h"

 #include "tbb/task.h"
 #include "tbb/task_group.h"
 #include "tbb/parallel_for.h"
 #include "tbb/cache_aligned_allocator.h"
 #include "tbb/global_control.h"
+#include "tbb/concurrent_vector.h"

 #include 
 #include 
@@ -110,6 +112,7 @@ void test_cancellation_on_exception( bool reset_ctx ) {
         }
         wait.reserve(1);
     }
+    wait.release(1);
     REQUIRE_MESSAGE(task.execute_counter() == (reset_ctx ? iter_counter : 1), "Some task was not executed");
     REQUIRE_MESSAGE(task.cancel_counter() == iter_counter, "Some task was not canceled after the exception occurs");

@@ -129,6 +132,8 @@ TEST_CASE("Test that task was executed p times") {
         wait.reserve(1);
     }

+    wait.release(1);
+
     REQUIRE_MESSAGE(CountingTask<>::execute_counter() == iter_counter, "The task was not executed necessary times");
     REQUIRE_MESSAGE(CountingTask<>::cancel_counter() == 0, "Some instance of the task was canceled");
     CountingTask<>::reset();
@@ -165,6 +170,7 @@ TEST_CASE("Simple test parallelism usage") {
         tbb::detail::d1::wait(wait, test_context);
         wait.reserve(threads_num);
     }
+    wait.release(threads_num);

     REQUIRE_MESSAGE(task_type::execute_counter() == iter_counter * threads_num, "Some task was not executed");
     REQUIRE_MESSAGE(task_type::cancel_counter() == 0, "Some task was canceled");
@@ -221,6 +227,7 @@ TEST_CASE("Test parallelism usage with parallel_for") {
         wait.reserve(task_threads_num);
     }

+    wait.release(task_threads_num);
     REQUIRE_MESSAGE(task_type::execute_counter() == task_threads_num * iter_count, "Some task was not executed");
     REQUIRE_MESSAGE(task_type::cancel_counter() == 0, "Some task was canceled");
@@ -263,6 +270,7 @@ TEST_CASE("Test parallelism usage with spawn tasks in different threads") {
         tbb::detail::d1::execute_and_wait(vector_test_task[threads_num - 1], test_context, wait, test_context);
         wait.reserve(threads_num);
     }
+    wait.release(threads_num);

     REQUIRE_MESSAGE(task_type::execute_counter() == iter_count * threads_num, "Some task was not executed");
     REQUIRE_MESSAGE(task_type::cancel_counter() == 0, "Some task was canceled");
@@ -372,8 +380,8 @@ TEST_CASE("Isolation + resumable tasks") {
         std::vector> test_task;
         tbb::detail::d1::wait_context wait(1);
         ++suspend_count;
-        tbb::task::suspend([&wait, &test_context, &test_task] (tbb::task::suspend_point tag) {
-            tbb::this_task_arena::isolate([&wait, &test_context, &test_task, &tag] {
+        tbb::this_task_arena::isolate([&wait, &test_context, &test_task] {
+            tbb::task::suspend([&wait, &test_context, &test_task] (tbb::task::suspend_point tag) {
                 test_task.emplace_back(tag, wait);
                 tbb::detail::d1::spawn(test_task[0], test_context);
             });
@@ -576,6 +584,7 @@ TEST_CASE("Stress testing") {
         tbb::detail::d1::wait(wait, test_context);
         wait.reserve(task_number);
     }
+    wait.release(task_number);
 });

@@ -583,6 +592,52 @@
     REQUIRE_MESSAGE(task_type::cancel_counter() == 0, "Some task was canceled");
 }

+//! \brief \ref error_guessing
+TEST_CASE("All workers sleep") {
+    std::size_t thread_number = utils::get_platform_max_threads();
+    tbb::concurrent_vector<tbb::task::suspend_point> suspend_points;
+
+    tbb::task_group test_gr;
+
+    utils::SpinBarrier barrier(thread_number);
+    auto resumable_task = [&] {
+        barrier.wait();
+        tbb::task::suspend([&] (tbb::task::suspend_point sp) {
+            suspend_points.push_back(sp);
+            barrier.wait();
+        });
+    };
+
+    for (std::size_t i = 0; i < thread_number - 1; ++i) {
+        test_gr.run(resumable_task);
+    }
+
+    barrier.wait();
+    barrier.wait();
+    TestCPUUserTime(thread_number);
+
+    for (auto sp : suspend_points)
+        tbb::task::resume(sp);
+    test_gr.wait();
+}
+
+//! \brief \ref error_guessing
+TEST_CASE("External threads sleep") {
+    if (utils::get_platform_max_threads() < 2) return;
+    utils::SpinBarrier barrier(2);
+
+    tbb::task_group test_gr;
+
+    test_gr.run([&] {
+        barrier.wait();
+        TestCPUUserTime(2);
+    });
+
+    barrier.wait();
+
+    test_gr.wait();
+}
+
 #endif // __TBB_RESUMABLE_TASKS

 //! \brief \ref error_guessing
@@ -644,6 +699,7 @@ TEST_CASE("Enqueue with exception") {
             wait.reserve(task_number);
         });
     }
+    wait.release(task_number);

     REQUIRE_MESSAGE(task_type::execute_counter() == task_number * iter_count, "Some task was not executed");
diff --git a/test/tbb/test_task_arena.cpp b/test/tbb/test_task_arena.cpp
index 6fc9c28330..e514196b28 100644
--- a/test/tbb/test_task_arena.cpp
+++ b/test/tbb/test_task_arena.cpp
@@ -20,18 +20,23 @@
 #include "common/spin_barrier.h"
 #include "common/utils.h"
 #include "common/utils_report.h"
+#include "common/utils_concurrency_limit.h"

 #include "tbb/task_arena.h"
 #include "tbb/task_scheduler_observer.h"
 #include "tbb/enumerable_thread_specific.h"
 #include "tbb/parallel_for.h"
 #include "tbb/global_control.h"
+#include "tbb/concurrent_set.h"
+#include "tbb/spin_mutex.h"
+#include "tbb/spin_rw_mutex.h"

 #include 
 #include 
 #include 
 #include 
 #include 
+#include <thread>

 //#include "harness_fp.h"

@@ -303,7 +308,7 @@ struct TestArenaEntryBody : FPModeContext {
     : FPModeContext(idx+i)
     , my_stage(s)
     , is_caught(false)
-#if TBB_USE_EXCEPTION
+#if TBB_USE_EXCEPTIONS
     , is_expected( (idx&(1<<i)) != 0 )

+#if TBB_USE_EXCEPTIONS
+void ExceptionInExecute() {
+    std::size_t thread_number = utils::get_platform_max_threads();
+    tbb::task_arena test_arena;
+    std::atomic<std::size_t> canceled_task{};
+
+    auto parallel_func = [&test_arena, &canceled_task] (std::size_t) {
+        for (std::size_t i = 0; i < 1000; ++i) {
+            try {
+                test_arena.execute([] {
+                    volatile bool suppress_unreachable_code_warning = true;
+                    if (suppress_unreachable_code_warning) {
+                        throw -1;
+                    }
+                });
+                FAIL("An exception should have been thrown.");
+            } catch (int) {
+                ++canceled_task;
+            } catch (...) {
+                FAIL("Wrong type of exception.");
+            }
+        }
+    };
+
+    utils::NativeParallelFor(thread_number, parallel_func);
+    CHECK(canceled_task == thread_number * 1000);
+}
+
+#endif // TBB_USE_EXCEPTIONS
+
+class simple_observer : public tbb::task_scheduler_observer {
+    static std::atomic<int> idx_counter;
+    int my_idx;
+    int myMaxConcurrency;   // concurrency of the associated arena
+    int myNumReservedSlots; // reserved slots in the associated arena
+    void on_scheduler_entry( bool is_worker ) override {
+        int current_index = tbb::this_task_arena::current_thread_index();
+        CHECK(current_index < (myMaxConcurrency > 1 ? myMaxConcurrency : 2));
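+        // Reserved slots are kept for external threads, so a worker must not
+        // occupy a slot index below myNumReservedSlots (checked below).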
+        if (is_worker) {
+            CHECK(current_index >= myNumReservedSlots);
+        }
+    }
+    void on_scheduler_exit( bool /*is_worker*/ ) override
+    {}
+public:
+    simple_observer(tbb::task_arena &a, int maxConcurrency, int numReservedSlots)
+        : tbb::task_scheduler_observer(a), my_idx(idx_counter++)
+        , myMaxConcurrency(maxConcurrency)
+        , myNumReservedSlots(numReservedSlots) {
+        observe(true);
+    }
+
+    friend bool operator<(const simple_observer& lhs, const simple_observer& rhs) {
+        return lhs.my_idx < rhs.my_idx;
+    }
+};
+
+std::atomic<int> simple_observer::idx_counter{};
+
+struct arena_handler {
+    enum arena_status {
+        alive,
+        deleting,
+        deleted
+    };
+
+    tbb::task_arena* arena;
+
+    std::atomic<arena_status> status{alive};
+    tbb::spin_rw_mutex arena_in_use{};
+
+    tbb::concurrent_set<simple_observer> observers;
+
+    arena_handler(tbb::task_arena* ptr) : arena(ptr)
+    {}
+
+    friend bool operator<(const arena_handler& lhs, const arena_handler& rhs) {
+        return lhs.arena < rhs.arena;
+    }
+};
+
+// TODO: Add observer operations
+void StressTestMixFunctionality() {
+    enum operation_type {
+        create_arena,
+        delete_arena,
+        attach_observer,
+        detach_observer,
+        arena_execute,
+        enqueue_task,
+        last_operation_marker
+    };
+
+    std::size_t operations_number = last_operation_marker;
+    std::size_t thread_number = utils::get_platform_max_threads();
+    utils::FastRandom<> operation_rnd(42);
+    tbb::spin_mutex random_operation_guard;
+
+    auto get_random_operation = [&operation_rnd, &random_operation_guard, operations_number] () {
+        tbb::spin_mutex::scoped_lock lock(random_operation_guard);
+        return static_cast<operation_type>(operation_rnd.get() % operations_number);
+    };
+
+    utils::FastRandom<> arena_rnd(42);
+    tbb::spin_mutex random_arena_guard;
+    auto get_random_arena = [&arena_rnd, &random_arena_guard] () {
+        tbb::spin_mutex::scoped_lock lock(random_arena_guard);
+        return arena_rnd.get();
+    };
+
+    tbb::concurrent_set<arena_handler> arenas_pool;
+
+    std::vector<std::thread> thread_pool;
+
+    utils::SpinBarrier thread_barrier(thread_number);
+    std::size_t max_operations = 100000;
+    std::atomic<std::size_t> curr_operation{};
+    auto thread_func = [&] () {
+        arenas_pool.emplace(new tbb::task_arena());
+        thread_barrier.wait();
+        while (curr_operation++ < max_operations) {
+            switch (get_random_operation()) {
+            case create_arena :
+            {
+                arenas_pool.emplace(new tbb::task_arena());
+                break;
+            }
+            case delete_arena :
+            {
+                auto curr_arena = arenas_pool.begin();
+                for (; curr_arena != arenas_pool.end(); ++curr_arena) {
+                    arena_handler::arena_status curr_status = arena_handler::alive;
+                    if (curr_arena->status.compare_exchange_strong(curr_status, arena_handler::deleting)) {
+                        break;
+                    }
+                }
+
+                if (curr_arena == arenas_pool.end()) break;
+
+                tbb::spin_rw_mutex::scoped_lock lock(curr_arena->arena_in_use, /*writer*/ true);
+
+                delete curr_arena->arena;
+                curr_arena->status.store(arena_handler::deleted);
+
+                break;
+            }
+            case attach_observer :
+            {
+                tbb::spin_rw_mutex::scoped_lock lock{};
+                auto curr_arena = arenas_pool.begin();
+                for (; curr_arena != arenas_pool.end(); ++curr_arena) {
+                    if (lock.try_acquire(curr_arena->arena_in_use, /*writer*/ false)) {
+                        if (curr_arena->status == arena_handler::alive) {
+                            break;
+                        } else {
+                            lock.release();
+                        }
+                    }
+                }
+
+                if (curr_arena == arenas_pool.end()) break;
+
+                {
+                    curr_arena->observers.emplace(*curr_arena->arena, thread_number, 1);
+                }
+
+                break;
+            }
+            case detach_observer :
+            {
+                auto arena_number = get_random_arena() % arenas_pool.size();
+                auto curr_arena = arenas_pool.begin();
+                std::advance(curr_arena, arena_number);
+
+                for (auto it = curr_arena->observers.begin(); it != curr_arena->observers.end(); ++it) {
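+                    // Deactivate the first observer that is still observing.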
+                    if (it->is_observing()) {
+                        it->observe(false);
+                        break;
+                    }
+                }
+
+                break;
+            }
+            case arena_execute :
+            {
+                tbb::spin_rw_mutex::scoped_lock lock{};
+                auto curr_arena = arenas_pool.begin();
+                for (; curr_arena != arenas_pool.end(); ++curr_arena) {
+                    if (lock.try_acquire(curr_arena->arena_in_use, /*writer*/ false)) {
+                        if (curr_arena->status == arena_handler::alive) {
+                            break;
+                        } else {
+                            lock.release();
+                        }
+                    }
+                }
+
+                if (curr_arena == arenas_pool.end()) break;
+
+                curr_arena->arena->execute([] () {
+                    tbb::parallel_for(tbb::blocked_range<int>(0, 10000), [] (tbb::blocked_range<int>&) {
+                        std::atomic<int> sum{};
+                        // Make some work
+                        for (; sum < 100; ++sum) ;
+                    });
+                });
+
+                break;
+            }
+            case enqueue_task :
+            {
+                tbb::spin_rw_mutex::scoped_lock lock{};
+                auto curr_arena = arenas_pool.begin();
+                for (; curr_arena != arenas_pool.end(); ++curr_arena) {
+                    if (lock.try_acquire(curr_arena->arena_in_use, /*writer*/ false)) {
+                        if (curr_arena->status == arena_handler::alive) {
+                            break;
+                        } else {
+                            lock.release();
+                        }
+                    }
+                }
+
+                if (curr_arena == arenas_pool.end()) break;
+
+                curr_arena->arena->enqueue([] {
+                    std::atomic<int> sum{};
+                    // Make some work
+                    for (; sum < 100000; ++sum) ;
+                });
+
+                break;
+            }
+            case last_operation_marker :
+                break;
+            }
+        }
+    };
+
+    for (std::size_t i = 0; i < thread_number - 1; ++i) {
+        thread_pool.emplace_back(thread_func);
+    }
+
+    thread_func();
+
+    for (std::size_t i = 0; i < thread_number - 1; ++i) {
+        if (thread_pool[i].joinable()) thread_pool[i].join();
+    }
+
+    for (auto& handler : arenas_pool) {
+        if (handler.status != arena_handler::deleted) delete handler.arena;
+    }
+}
+
+struct enqueue_test_helper {
+    enqueue_test_helper(tbb::task_arena& arena, tbb::enumerable_thread_specific<bool>& ets, std::atomic<std::size_t>& task_counter)
+        : my_arena(arena), my_ets(ets), my_task_counter(task_counter)
+    {}
+
+    enqueue_test_helper(const enqueue_test_helper& ef)
+        : my_arena(ef.my_arena), my_ets(ef.my_ets), my_task_counter(ef.my_task_counter)
+    {}
+
+    void operator() () const {
+        CHECK(my_ets.local());
+        if (my_task_counter++ < 100000) my_arena.enqueue(enqueue_test_helper(my_arena, my_ets, my_task_counter));
+        std::this_thread::yield();
+    }
+
+    tbb::task_arena& my_arena;
+    tbb::enumerable_thread_specific<bool>& my_ets;
+    std::atomic<std::size_t>& my_task_counter;
+};
+
 //--------------------------------------------------//

 //! Test for task arena in concurrent cases
@@ -1487,3 +1767,52 @@ TEST_CASE("Multiple waits") {
 TEST_CASE("Small stack size") {
     TestSmallStackSize();
 }
+
+#if TBB_USE_EXCEPTIONS
+//! \brief \ref requirement \ref stress
+TEST_CASE("Test for exceptions during execute.") {
+    ExceptionInExecute();
+}
+#endif // TBB_USE_EXCEPTIONS
+
+//! \brief \ref stress
+TEST_CASE("Stress test with mixing functionality") {
+    StressTestMixFunctionality();
+}
+
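+// The arena below is oversubscribed to twice the platform concurrency. A
+// barrier across 2 * num_threads threads is run inside it before and after a
+// stream of self-reposting enqueued tasks, so every worker slot has to be
+// filled in both phases.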
+//! \brief \ref stress
+TEST_CASE("Workers oversubscription") {
+    std::size_t num_threads = utils::get_platform_max_threads();
+    tbb::enumerable_thread_specific<bool> ets;
+    tbb::global_control gl(tbb::global_control::max_allowed_parallelism, num_threads * 2);
+    tbb::task_arena arena(num_threads * 2);
+
+    utils::SpinBarrier barrier(num_threads * 2);
+
+    arena.execute([&] {
+        tbb::parallel_for(std::size_t(0), num_threads * 2,
+            [&] (const std::size_t&) {
+                ets.local() = true;
+                barrier.wait();
+            }
+        );
+    });
+
+    std::this_thread::yield();
+
+    std::atomic<std::size_t> task_counter{0};
+    for (std::size_t i = 0; i < num_threads / 4 + 1; ++i) {
+        arena.enqueue(enqueue_test_helper(arena, ets, task_counter));
+    }
+
+    while (task_counter < 100000) std::this_thread::yield();
+
+    arena.execute([&] {
+        tbb::parallel_for(std::size_t(0), num_threads * 2,
+            [&] (const std::size_t&) {
+                CHECK(ets.local());
+                barrier.wait();
+            }
+        );
+    });
+}
diff --git a/test/tbb/test_task_group.cpp b/test/tbb/test_task_group.cpp
index 8140efa1a6..d81b7721a1 100644
--- a/test/tbb/test_task_group.cpp
+++ b/test/tbb/test_task_group.cpp
@@ -477,15 +477,15 @@ void LaunchChildrenWithFunctor () {
     bool exceptionCaught = false;
     try {
         status = g.wait();
-    }
-    catch ( TestException& e ) {
+    } catch ( TestException& e ) {
         CHECK_MESSAGE( e.what(), "Empty what() string" );
         CHECK_MESSAGE( strcmp(e.what(), EXCEPTION_DESCR1) == 0, "Unknown exception" );
         exceptionCaught = true;
         ++g_ExceptionCount;
     } catch( ... ) {
         CHECK_MESSAGE( false, "Unknown exception" );
     }
-    if (g_Throw && !exceptionCaught && status != tbb::canceled)
-        CHECK_MESSAGE( false, "No exception in the child task group" );
+    if (g_Throw && !exceptionCaught && status != tbb::canceled) {
+        CHECK_MESSAGE(false, "No exception in the child task group");
+    }
     if ( g_Rethrow && g_ExceptionCount > SKIP_GROUPS ) {
         throw test_exception(EXCEPTION_DESCR2);
     }
@@ -515,7 +515,7 @@ void TestManualCancellationWithFunctor () {
     CHECK_MESSAGE( g_TaskCount <= g_ExecutedAtCancellation + utils::ConcurrencyTracker::PeakParallelism(), "Too many tasks survived cancellation" );
 }

-#if TBB_USE_EXCEPTION
+#if TBB_USE_EXCEPTIONS
 template<typename task_group_type>
 void TestExceptionHandling1 () {
     ResetGlobals( true, false );
@@ -555,6 +555,23 @@ void TestExceptionHandling2 () {
     CHECK_MESSAGE( g_ExceptionCount < NUM_GROUPS - SKIP_GROUPS, "None of the child groups was cancelled" );
 }

+template<typename task_group_type>
+void TestExceptionHandling3() {
+    task_group_type tg;
+    try {
+        tg.run_and_wait([]() {
+            volatile bool suppress_unreachable_code_warning = true;
+            if (suppress_unreachable_code_warning) {
+                throw 1;
+            }
+        });
+    } catch (int error) {
+        CHECK(error == 1);
+    } catch ( ... ) {
+        CHECK_MESSAGE( false, "Unexpected exception" );
+    }
+}
+
 template<typename task_group_type>
 class LaunchChildrenDriver {
 public:
@@ -584,8 +601,10 @@ void TestMissingWait () {
     try {
         task_group_type tg;
         driver.Launch( tg );
-        if ( Throw )
+        volatile bool suppress_unreachable_code_warning = Throw;
+        if (suppress_unreachable_code_warning) {
             throw int(); // Initiate stack unwinding
+        }
     }
     catch ( const tbb::missing_wait& e ) {
         CHECK_MESSAGE( e.what(), "Error message is absent" );
@@ -608,9 +627,10 @@ template<typename task_group_type>
 void RunCancellationAndExceptionHandlingTests() {
     TestManualCancellationWithFunctor<task_group_type>();
-#if TBB_USE_EXCEPTION
+#if TBB_USE_EXCEPTIONS
     TestExceptionHandling1<task_group_type>();
     TestExceptionHandling2<task_group_type>();
+    TestExceptionHandling3<task_group_type>();
     TestMissingWait<task_group_type, true>();
     TestMissingWait<task_group_type, false>();
 #endif
@@ -767,7 +787,7 @@ TEST_CASE("Fibonacci test for the task group") {

 //! Cancellation and exception test for the task group
 //! \brief \ref interface \ref requirement
 TEST_CASE("Cancellation and exception test for the task group") {
-    for (unsigned p=MinThread; p <= MaxThread; ++p) {
+    for (unsigned p = MinThread; p <= MaxThread; ++p) {
         tbb::global_control limit(tbb::global_control::max_allowed_parallelism, p);
         g_MaxConcurrency = p;
         RunCancellationAndExceptionHandlingTests<tbb::task_group>();
@@ -944,3 +964,33 @@ TEST_CASE("Test for stack overflow avoidance mechanism within arena") {
     });
     CHECK(tasks_executed == 10000 + second_thread_executed);
 }
+
+//! Checks that work can be submitted to a task_group concurrently with waiting on it.
+//! \brief \ref regression
+TEST_CASE("Async task group") {
+    int num_threads = tbb::this_task_arena::max_concurrency();
+    tbb::task_arena a(2 * num_threads, num_threads);
+    utils::SpinBarrier barrier(num_threads + 2);
+    tbb::task_group tg[2];
+    std::atomic<bool> finished[2]{};
+    finished[0] = false; finished[1] = false;
+    for (int i = 0; i < 2; ++i) {
+        a.enqueue([i, &tg, &finished, &barrier] {
+            barrier.wait();
+            for (int j = 0; j < 10000; ++j) {
+                tg[i].run([] {});
+                std::this_thread::yield();
+            }
+            finished[i] = true;
+        });
+    }
+    utils::NativeParallelFor(num_threads, [&](int idx) {
+        barrier.wait();
+        a.execute([idx, &tg, &finished] {
+            while (!finished[idx % 2]) {
+                tg[idx % 2].wait();
+            }
+            tg[idx % 2].wait();
+        });
+    });
+}
diff --git a/test/tbb/test_tbb_header.cpp b/test/tbb/test_tbb_header.cpp
index 76bf7cf09c..1317861c20 100644
--- a/test/tbb/test_tbb_header.cpp
+++ b/test/tbb/test_tbb_header.cpp
@@ -32,7 +32,6 @@
 #define TBB_PREVIEW_VARIADIC_PARALLEL_INVOKE 1
 #define TBB_PREVIEW_BLOCKED_RANGE_ND 1
 #define TBB_PREVIEW_ISOLATED_TASK_GROUP 1
-#define TBB_PREVIEW_NUMA_SUPPORT 1
 #endif

 #if __TBB_TEST_SECONDARY
diff --git a/test/tbbmalloc/test_malloc_overload.cpp b/test/tbbmalloc/test_malloc_overload.cpp
index cf8da63afa..d04f475afa 100644
--- a/test/tbbmalloc/test_malloc_overload.cpp
+++ b/test/tbbmalloc/test_malloc_overload.cpp
@@ -127,7 +127,7 @@ using namespace std;
 #endif

 #include "tbb/detail/_utils.h" // tbb::detail::is_aligned
-#include "../src/tbbmalloc/shared_utils.h" // alignDown, alignUp, estimatedCacheLineSize
+#include "src/tbbmalloc/shared_utils.h" // alignDown, alignUp, estimatedCacheLineSize

 /* start of code replicated from src/tbbmalloc */