diff --git a/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h b/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h new file mode 100644 index 00000000000..a70c1853e1a --- /dev/null +++ b/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h @@ -0,0 +1,183 @@ +/* file: mrg32k3a.h */ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/* +//++ +// Implementation of the MRG32k3a engine: a 32-bit combined multiple recursive generator +// with two components of order 3, optimized for batch processing. +//-- +*/ + +#ifndef __MRG32K3A_H__ +#define __MRG32K3A_H__ + +#include "algorithms/engines/mrg32k3a/mrg32k3a_types.h" +#include "algorithms/engines/engine.h" + +namespace daal +{ +namespace algorithms +{ +namespace engines +{ +namespace mrg32k3a +{ +/** + * @defgroup engines_mrg32k3a_batch Batch + * @ingroup engines_mrg32k3a + * @{ + */ +namespace interface1 +{ +/** + * + * \brief Provides methods to run implementations of the mrg32k3a engine. + * This class is associated with the \ref mrg32k3a::interface1::Batch "mrg32k3a::Batch" class + * and supports the method of mrg32k3a engine computation in the batch processing mode + * + * \tparam algorithmFPType Data type to use in intermediate computations of mrg32k3a engine, double or float + * \tparam method Computation method of the engine, mrg32k3a::Method + * \tparam cpu Version of the cpu-specific implementation of the engine, daal::CpuType + */ +template +class BatchContainer : public daal::algorithms::AnalysisContainerIface +{ +public: + /** + * Constructs a container for the mrg32k3a engine with a specified environment + * in the batch processing mode + * \param[in] daalEnv Environment object + */ + BatchContainer(daal::services::Environment::env * daalEnv); + ~BatchContainer(); + /** + * Computes the result of the mrg32k3a engine in the batch processing mode + * + * \return Status of computations + */ + services::Status compute() DAAL_C11_OVERRIDE; +}; + +/** + * + * \brief Provides methods for mrg32k3a engine computations in the batch processing mode + * + * \tparam algorithmFPType Data type to use in intermediate computations of mrg32k3a engine, double or float + * \tparam method Computation method of the engine, mrg32k3a::Method + * + * \par Enumerations + * - mrg32k3a::Method Computation methods for the mrg32k3a engine + * + * \par References + * - \ref engines::interface1::Input "engines::Input" class + * - \ref engines::interface1::Result "engines::Result" class + */ +template +class DAAL_EXPORT Batch : public engines::BatchBase +{ +public: + typedef engines::BatchBase super; + + typedef typename super::InputType InputType; + typedef typename super::ResultType ResultType; + + /** + * Creates mrg32k3a engine + * \param[in] seed Initial condition for mrg32k3a engine + * + * \return Pointer to mrg32k3a engine + */ + static 
services::SharedPtr > create(size_t seed = 777); + + /** + * Returns method of the engine + * \return Method of the engine + */ + virtual int getMethod() const DAAL_C11_OVERRIDE { return (int)method; } + + /** + * Returns the structure that contains results of mrg32k3a engine + * \return Structure that contains results of mrg32k3a engine + */ + ResultPtr getResult() { return _result; } + + /** + * Registers user-allocated memory to store results of mrg32k3a engine + * \param[in] result Structure to store results of mrg32k3a engine + * + * \return Status of computations + */ + services::Status setResult(const ResultPtr & result) + { + DAAL_CHECK(result, services::ErrorNullResult) + _result = result; + _res = _result.get(); + return services::Status(); + } + + /** + * Returns a pointer to the newly allocated mrg32k3a engine + * with a copy of input objects and parameters of this mrg32k3a engine + * \return Pointer to the newly allocated engine + */ + services::SharedPtr > clone() const { return services::SharedPtr >(cloneImpl()); } + + /** + * Allocates memory to store the result of the mrg32k3a engine + * + * \return Status of computations + */ + virtual services::Status allocateResult() DAAL_C11_OVERRIDE + { + services::Status s = this->_result->template allocate(&(this->input), NULL, (int)method); + this->_res = this->_result.get(); + return s; + } + +protected: + Batch(size_t seed = 777) { initialize(); } + + Batch(const Batch & other) : super(other) { initialize(); } + + virtual Batch * cloneImpl() const DAAL_C11_OVERRIDE { return new Batch(*this); } + + void initialize() + { + Analysis::_ac = new __DAAL_ALGORITHM_CONTAINER(batch, BatchContainer, algorithmFPType, method)(&_env); + _in = &input; + _result.reset(new ResultType()); + } + +private: + ResultPtr _result; + + Batch & operator=(const Batch &); +}; +typedef services::SharedPtr > mrg32k3aPtr; +typedef services::SharedPtr > mrg32k3aConstPtr; + +} // namespace interface1 +using interface1::BatchContainer; +using interface1::Batch; +using interface1::mrg32k3aPtr; +using interface1::mrg32k3aConstPtr; +/** @} */ +} // namespace mrg32k3a +} // namespace engines +} // namespace algorithms +} // namespace daal +#endif diff --git a/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a_types.h b/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a_types.h new file mode 100644 index 00000000000..8fdc58b98c8 --- /dev/null +++ b/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a_types.h @@ -0,0 +1,65 @@ +/* file: mrg32k3a_types.h */ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/* +//++ +// Implementation of the MRG32k3a engine: a 32-bit combined multiple recursive generator +// with two components of order 3, optimized for batch processing. 
+//-- +*/ + +#ifndef __MRG32K3A_TYPES_H__ +#define __MRG32K3A_TYPES_H__ + +#include "algorithms/algorithm.h" +#include "services/daal_defines.h" +#include "data_management/data/numeric_table.h" +#include "data_management/data/homogen_numeric_table.h" + +namespace daal +{ +namespace algorithms +{ +namespace engines +{ +/** + * @defgroup engines_mrg32k3a mrg32k3a Engine + * \copydoc daal::algorithms::engines::mrg32k3a + * @ingroup engines + * @{ + */ +/** + * \brief Contains classes for mrg32k3a engine + */ +namespace mrg32k3a +{ +/** + * + * Available methods to compute mrg32k3a engine + */ +enum Method +{ + defaultDense = 0 /*!< Default: performance-oriented method. */ +}; + +} // namespace mrg32k3a +/** @} */ +} // namespace engines +} // namespace algorithms +} // namespace daal + +#endif diff --git a/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10.h b/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10.h new file mode 100644 index 00000000000..3a5d0e33180 --- /dev/null +++ b/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10.h @@ -0,0 +1,183 @@ +/* file: philox4x32x10.h */ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/* +//++ +// Implementation of the Philox4x32-10 engine: a counter-based pseudorandom number generator (PRNG) +// that uses 4x32-bit keys and performs 10 rounds of mixing to produce high-quality randomness. +//-- +*/ + +#ifndef __PHILOX4X32X10_H__ +#define __PHILOX4X32X10_H__ + +#include "algorithms/engines/philox4x32x10/philox4x32x10_types.h" +#include "algorithms/engines/engine.h" + +namespace daal +{ +namespace algorithms +{ +namespace engines +{ +namespace philox4x32x10 +{ +/** + * @defgroup engines_philox4x32x10_batch Batch + * @ingroup engines_philox4x32x10 + * @{ + */ +namespace interface1 +{ +/** + * + * \brief Provides methods to run implementations of the philox4x32x10 engine. 
+ * This class is associated with the \ref philox4x32x10::interface1::Batch "philox4x32x10::Batch" class + * and supports the method of philox4x32x10 engine computation in the batch processing mode + * + * \tparam algorithmFPType Data type to use in intermediate computations of philox4x32x10 engine, double or float + * \tparam method Computation method of the engine, philox4x32x10::Method + * \tparam cpu Version of the cpu-specific implementation of the engine, daal::CpuType + */ +template +class BatchContainer : public daal::algorithms::AnalysisContainerIface +{ +public: + /** + * Constructs a container for the philox4x32x10 engine with a specified environment + * in the batch processing mode + * \param[in] daalEnv Environment object + */ + BatchContainer(daal::services::Environment::env * daalEnv); + ~BatchContainer(); + /** + * Computes the result of the philox4x32x10 engine in the batch processing mode + * + * \return Status of computations + */ + services::Status compute() DAAL_C11_OVERRIDE; +}; + +/** + * + * \brief Provides methods for philox4x32x10 engine computations in the batch processing mode + * + * \tparam algorithmFPType Data type to use in intermediate computations of philox4x32x10 engine, double or float + * \tparam method Computation method of the engine, philox4x32x10::Method + * + * \par Enumerations + * - philox4x32x10::Method Computation methods for the philox4x32x10 engine + * + * \par References + * - \ref engines::interface1::Input "engines::Input" class + * - \ref engines::interface1::Result "engines::Result" class + */ +template +class DAAL_EXPORT Batch : public engines::BatchBase +{ +public: + typedef engines::BatchBase super; + + typedef typename super::InputType InputType; + typedef typename super::ResultType ResultType; + + /** + * Creates philox4x32x10 engine + * \param[in] seed Initial condition for philox4x32x10 engine + * + * \return Pointer to philox4x32x10 engine + */ + static services::SharedPtr > create(size_t seed = 777); + + /** + * Returns method of the engine + * \return Method of the engine + */ + virtual int getMethod() const DAAL_C11_OVERRIDE { return (int)method; } + + /** + * Returns the structure that contains results of philox4x32x10 engine + * \return Structure that contains results of philox4x32x10 engine + */ + ResultPtr getResult() { return _result; } + + /** + * Registers user-allocated memory to store results of philox4x32x10 engine + * \param[in] result Structure to store results of philox4x32x10 engine + * + * \return Status of computations + */ + services::Status setResult(const ResultPtr & result) + { + DAAL_CHECK(result, services::ErrorNullResult) + _result = result; + _res = _result.get(); + return services::Status(); + } + + /** + * Returns a pointer to the newly allocated philox4x32x10 engine + * with a copy of input objects and parameters of this philox4x32x10 engine + * \return Pointer to the newly allocated engine + */ + services::SharedPtr > clone() const { return services::SharedPtr >(cloneImpl()); } + + /** + * Allocates memory to store the result of the philox4x32x10 engine + * + * \return Status of computations + */ + virtual services::Status allocateResult() DAAL_C11_OVERRIDE + { + services::Status s = this->_result->template allocate(&(this->input), NULL, (int)method); + this->_res = this->_result.get(); + return s; + } + +protected: + Batch(size_t seed = 777) { initialize(); } + + Batch(const Batch & other) : super(other) { initialize(); } + + virtual Batch * cloneImpl() const DAAL_C11_OVERRIDE { return new 
Batch(*this); } + + void initialize() + { + Analysis::_ac = new __DAAL_ALGORITHM_CONTAINER(batch, BatchContainer, algorithmFPType, method)(&_env); + _in = &input; + _result.reset(new ResultType()); + } + +private: + ResultPtr _result; + + Batch & operator=(const Batch &); +}; +typedef services::SharedPtr > philox4x32x10Ptr; +typedef services::SharedPtr > philox4x32x10ConstPtr; + +} // namespace interface1 +using interface1::BatchContainer; +using interface1::Batch; +using interface1::philox4x32x10Ptr; +using interface1::philox4x32x10ConstPtr; +/** @} */ +} // namespace philox4x32x10 +} // namespace engines +} // namespace algorithms +} // namespace daal +#endif diff --git a/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10_types.h b/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10_types.h new file mode 100644 index 00000000000..0c0a92c9b3a --- /dev/null +++ b/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10_types.h @@ -0,0 +1,65 @@ +/* file: philox4x32x10_types.h */ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/* +//++ +// Implementation of the Philox4x32-10 engine: a counter-based pseudorandom number generator (PRNG) +// that uses 4x32-bit keys and performs 10 rounds of mixing to produce high-quality randomness. +//-- +*/ + +#ifndef __PHILOX4X32X10_TYPES_H__ +#define __PHILOX4X32X10_TYPES_H__ + +#include "algorithms/algorithm.h" +#include "services/daal_defines.h" +#include "data_management/data/numeric_table.h" +#include "data_management/data/homogen_numeric_table.h" + +namespace daal +{ +namespace algorithms +{ +namespace engines +{ +/** + * @defgroup engines_philox4x32x10 philox4x32x10 Engine + * \copydoc daal::algorithms::engines::philox4x32x10 + * @ingroup engines + * @{ + */ +/** + * \brief Contains classes for philox4x32x10 engine + */ +namespace philox4x32x10 +{ +/** + * + * Available methods to compute philox4x32x10 engine + */ +enum Method +{ + defaultDense = 0 /*!< Default: performance-oriented method. 
*/ +}; + +} // namespace philox4x32x10 +/** @} */ +} // namespace engines +} // namespace algorithms +} // namespace daal + +#endif diff --git a/cpp/daal/include/daal.h b/cpp/daal/include/daal.h index 881a6c39fbe..f49625f8939 100755 --- a/cpp/daal/include/daal.h +++ b/cpp/daal/include/daal.h @@ -297,13 +297,17 @@ #include "algorithms/distributions/bernoulli/bernoulli.h" #include "algorithms/distributions/bernoulli/bernoulli_types.h" #include "algorithms/engines/engine.h" +#include "algorithms/engines/engine_family.h" +#include "algorithms/engines/mt2203/mt2203.h" +#include "algorithms/engines/mt2203/mt2203_types.h" #include "algorithms/engines/mt19937/mt19937.h" #include "algorithms/engines/mt19937/mt19937_types.h" #include "algorithms/engines/mcg59/mcg59.h" #include "algorithms/engines/mcg59/mcg59_types.h" -#include "algorithms/engines/engine_family.h" -#include "algorithms/engines/mt2203/mt2203.h" -#include "algorithms/engines/mt2203/mt2203_types.h" +#include "algorithms/engines/mrg32k3a/mrg32k3a.h" +#include "algorithms/engines/mrg32k3a/mrg32k3a_types.h" +#include "algorithms/engines/philox4x32x10/philox4x32x10.h" +#include "algorithms/engines/philox4x32x10/philox4x32x10_types.h" #include "algorithms/dbscan/dbscan_types.h" #include "algorithms/dbscan/dbscan_batch.h" #include "algorithms/dbscan/dbscan_distributed.h" diff --git a/cpp/daal/include/daal_win.h b/cpp/daal/include/daal_win.h index e17eff16796..a15ed7db26e 100755 --- a/cpp/daal/include/daal_win.h +++ b/cpp/daal/include/daal_win.h @@ -309,13 +309,17 @@ #include "algorithms/distributions/bernoulli/bernoulli.h" #include "algorithms/distributions/bernoulli/bernoulli_types.h" #include "algorithms/engines/engine.h" +#include "algorithms/engines/engine_family.h" +#include "algorithms/engines/mt2203/mt2203.h" +#include "algorithms/engines/mt2203/mt2203_types.h" #include "algorithms/engines/mt19937/mt19937.h" #include "algorithms/engines/mt19937/mt19937_types.h" #include "algorithms/engines/mcg59/mcg59.h" #include "algorithms/engines/mcg59/mcg59_types.h" -#include "algorithms/engines/engine_family.h" -#include "algorithms/engines/mt2203/mt2203.h" -#include "algorithms/engines/mt2203/mt2203_types.h" +#include "algorithms/engines/mrg32k3a/mrg32k3a.h" +#include "algorithms/engines/mrg32k3a/mrg32k3a_types.h" +#include "algorithms/engines/philox4x32x10/philox4x32x10.h" +#include "algorithms/engines/philox4x32x10/philox4x32x10_types.h" #include "algorithms/dbscan/dbscan_types.h" #include "algorithms/dbscan/dbscan_batch.h" #include "algorithms/dbscan/dbscan_distributed.h" diff --git a/cpp/daal/src/algorithms/engines/mcg59/mcg59_batch_impl.h b/cpp/daal/src/algorithms/engines/mcg59/mcg59_batch_impl.h index 6c3040da615..62f337ba9a0 100644 --- a/cpp/daal/src/algorithms/engines/mcg59/mcg59_batch_impl.h +++ b/cpp/daal/src/algorithms/engines/mcg59/mcg59_batch_impl.h @@ -26,9 +26,6 @@ #include "src/externals/service_rng.h" #include "src/data_management/service_numeric_table.h" -static const int leapfrogMethodErrcode = -1002; -static const int skipAheadMethodErrcode = -1003; - namespace daal { namespace algorithms @@ -67,7 +64,7 @@ class BatchImpl : public algorithms::engines::mcg59::interface1::Batch +SharedPtr > Batch::create(size_t seed) +{ + SharedPtr > engPtr; +#define DAAL_CREATE_ENGINE_CPU(cpuId, ...) 
engPtr.reset(new BatchImpl(__VA_ARGS__)); + + DAAL_DISPATCH_FUNCTION_BY_CPU(DAAL_CREATE_ENGINE_CPU, seed); + +#undef DAAL_CREATE_ENGINE_CPU + return engPtr; +} + +template class Batch; +template class Batch; + +} // namespace interface1 +} // namespace mrg32k3a +} // namespace engines +} // namespace algorithms +} // namespace daal diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_container.h b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_container.h new file mode 100644 index 00000000000..ce83f554026 --- /dev/null +++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_container.h @@ -0,0 +1,68 @@ +/* file: mrg32k3a_batch_container.h */ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/* +//++ +// Implementation of mrg32k3a calculation algorithm container. +//-- +*/ + +#ifndef __mrg32k3a_BATCH_CONTAINER_H__ +#define __mrg32k3a_BATCH_CONTAINER_H__ + +#include "algorithms/engines/mrg32k3a/mrg32k3a.h" +#include "src/algorithms/engines/mrg32k3a/mrg32k3a_kernel.h" + +namespace daal +{ +namespace algorithms +{ +namespace engines +{ +namespace mrg32k3a +{ +namespace interface1 +{ +template +BatchContainer::BatchContainer(daal::services::Environment::env * daalEnv) : AnalysisContainerIface(daalEnv) +{ + __DAAL_INITIALIZE_KERNELS(internal::mrg32k3aKernel, algorithmFPType, method); +} + +template +BatchContainer::~BatchContainer() +{ + __DAAL_DEINITIALIZE_KERNELS(); +} + +template +services::Status BatchContainer::compute() +{ + daal::services::Environment::env & env = *_env; + engines::Result * result = static_cast(_res); + NumericTable * resultTable = result->get(engines::randomNumbers).get(); + + __DAAL_CALL_KERNEL(env, internal::mrg32k3aKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, resultTable); +} + +} // namespace interface1 +} // namespace mrg32k3a +} // namespace engines +} // namespace algorithms +} // namespace daal + +#endif diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h new file mode 100644 index 00000000000..9c226e54af3 --- /dev/null +++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h @@ -0,0 +1,114 @@ +/* file: mrg32k3a_batch_impl.h */ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/* +//++ +// Implementation of the class defining the mrg32k3a engine. +//-- +*/ + +#include "algorithms/engines/mrg32k3a/mrg32k3a.h" +#include "src/algorithms/engines/engine_batch_impl.h" +#include "src/externals/service_rng.h" +#include "src/data_management/service_numeric_table.h" + +namespace daal +{ +namespace algorithms +{ +namespace engines +{ +namespace mrg32k3a +{ +namespace internal +{ +template +class BatchImpl : public algorithms::engines::mrg32k3a::interface1::Batch, + public algorithms::engines::internal::BatchBaseImpl +{ +public: + typedef algorithms::engines::mrg32k3a::interface1::Batch super1; + typedef algorithms::engines::internal::BatchBaseImpl super2; + BatchImpl(size_t seed = 777) : baseRng(seed, __DAAL_BRNG_MRG32K3A), super2(seed) {} + + void * getState() DAAL_C11_OVERRIDE { return baseRng.getState(); } + + int getStateSize() const DAAL_C11_OVERRIDE { return baseRng.getStateSize(); } + + services::Status saveStateImpl(byte * dest) const DAAL_C11_OVERRIDE + { + DAAL_CHECK(!baseRng.saveState((void *)dest), ErrorIncorrectErrorcodeFromGenerator); + return services::Status(); + } + + services::Status loadStateImpl(const byte * src) DAAL_C11_OVERRIDE + { + DAAL_CHECK(!baseRng.loadState((const void *)src), ErrorIncorrectErrorcodeFromGenerator); + return services::Status(); + } + + services::Status leapfrogImpl(size_t threadNum, size_t nThreads) DAAL_C11_OVERRIDE + { + int errcode = baseRng.leapfrog(threadNum, nThreads); + services::Status s; + if (errcode == __DAAL_RNG_ERROR_LEAPFROG_UNSUPPORTED) + s.add(ErrorLeapfrogUnsupported); + else if (errcode) + s.add(ErrorIncorrectErrorcodeFromGenerator); + return s; + } + + services::Status skipAheadImpl(size_t nSkip) DAAL_C11_OVERRIDE + { + int errcode = baseRng.skipAhead(nSkip); + services::Status s; + if (errcode == __DAAL_RNG_ERROR_SKIPAHEAD_UNSUPPORTED) + s.add(ErrorSkipAheadUnsupported); + else if (errcode) + s.add(ErrorIncorrectErrorcodeFromGenerator); + return s; + } + + virtual BatchImpl * cloneImpl() const DAAL_C11_OVERRIDE + { + return new BatchImpl(*this); + } + + bool hasSupport(engines::internal::ParallelizationTechnique technique) const DAAL_C11_OVERRIDE + { + switch (technique) + { + case engines::internal::family: return false; + case engines::internal::skipahead: return true; + case engines::internal::leapfrog: return true; + } + return false; + } + + ~BatchImpl() {} + +protected: + BatchImpl(const BatchImpl & other) : super1(other), super2(other), baseRng(other.baseRng) {} + + daal::internal::BaseRNGsInst baseRng; +}; + +} // namespace internal +} // namespace mrg32k3a +} // namespace engines +} // namespace algorithms +} // namespace daal diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_cpu.cpp b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_cpu.cpp new file mode 100644 index 00000000000..529c4af2635 --- /dev/null +++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_cpu.cpp @@ -0,0 +1,47 @@ +/* file: mrg32k3a_dense_default_batch_fpt_cpu.cpp */ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +//++ +// Implementation of mrg32k3a calculation functions. +//-- + +#include "src/algorithms/engines/mrg32k3a/mrg32k3a_batch_container.h" +#include "src/algorithms/engines/mrg32k3a/mrg32k3a_kernel.h" +#include "src/algorithms/engines/mrg32k3a/mrg32k3a_impl.i" + +namespace daal +{ +namespace algorithms +{ +namespace engines +{ +namespace mrg32k3a +{ +namespace interface1 +{ +template class BatchContainer; +} // namespace interface1 + +namespace internal +{ +template class mrg32k3aKernel; +} // namespace internal + +} // namespace mrg32k3a +} // namespace engines +} // namespace algorithms +} // namespace daal diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_dispatcher.cpp new file mode 100644 index 00000000000..fd78108df73 --- /dev/null +++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_dispatcher.cpp @@ -0,0 +1,30 @@ +/* file: mrg32k3a_dense_default_batch_fpt_dispatcher.cpp */ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +//++ +// Implementation of mrg32k3a calculation algorithm dispatcher. +//-- + +#include "src/algorithms/engines/mrg32k3a/mrg32k3a_batch_container.h" + +namespace daal +{ +namespace algorithms +{ +__DAAL_INSTANTIATE_DISPATCH_CONTAINER(engines::mrg32k3a::BatchContainer, batch, DAAL_FPTYPE, engines::mrg32k3a::defaultDense) +} // namespace algorithms +} // namespace daal diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_impl.i b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_impl.i new file mode 100644 index 00000000000..f8f12b2deea --- /dev/null +++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_impl.i @@ -0,0 +1,49 @@ +/* file: mrg32k3a_impl.i */ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/* +//++ +// Implementation of mrg32k3a algorithm. +//-- +*/ + +#ifndef __MRG32K3A_IMPL_I__ +#define __MRG32K3A_IMPL_I__ + +namespace daal +{ +namespace algorithms +{ +namespace engines +{ +namespace mrg32k3a +{ +namespace internal +{ +template +Status mrg32k3aKernel::compute(NumericTable * resultTensor) +{ + return Status(); +} + +} // namespace internal +} // namespace mrg32k3a +} // namespace engines +} // namespace algorithms +} // namespace daal + +#endif diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_kernel.h b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_kernel.h new file mode 100644 index 00000000000..80c9fbe44d9 --- /dev/null +++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_kernel.h @@ -0,0 +1,58 @@ +/* file: mrg32k3a_kernel.h */ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +//++ +// Declaration of a template function for calculating values using the MRG32k3a generator. 
+//-- + +#ifndef __MRG32K3A_KERNEL_H__ +#define __MRG32K3A_KERNEL_H__ + +#include "algorithms/engines/mrg32k3a/mrg32k3a.h" +#include "src/algorithms/kernel.h" +#include "data_management/data/numeric_table.h" + +using namespace daal::services; +using namespace daal::data_management; + +namespace daal +{ +namespace algorithms +{ +namespace engines +{ +namespace mrg32k3a +{ +namespace internal +{ +/** + * \brief Kernel for mrg32k3a calculation + */ +template +class mrg32k3aKernel : public Kernel +{ +public: + Status compute(NumericTable * resultTable); +}; + +} // namespace internal +} // namespace mrg32k3a +} // namespace engines +} // namespace algorithms +} // namespace daal + +#endif diff --git a/cpp/daal/src/algorithms/engines/mt19937/mt19937_batch_impl.h b/cpp/daal/src/algorithms/engines/mt19937/mt19937_batch_impl.h index e92d0e46612..805ded3153c 100644 --- a/cpp/daal/src/algorithms/engines/mt19937/mt19937_batch_impl.h +++ b/cpp/daal/src/algorithms/engines/mt19937/mt19937_batch_impl.h @@ -26,9 +26,6 @@ #include "src/externals/service_rng.h" #include "src/data_management/service_numeric_table.h" -static const int leapfrogMethodErrcode = -1002; -static const int skipAheadMethodErrcode = -1003; - namespace daal { namespace algorithms @@ -67,7 +64,7 @@ class BatchImpl : public algorithms::engines::mt19937::interface1::Batchleapfrog(threadNum, nThreads); services::Status s; - if (errcode == leapfrogMethodErrcode) + if (errcode == __DAAL_RNG_ERROR_LEAPFROG_UNSUPPORTED) s.add(ErrorLeapfrogUnsupported); else if (errcode) s.add(ErrorIncorrectErrorcodeFromGenerator); @@ -199,7 +196,7 @@ class BatchImpl : public algorithms::engines::mt2203::interface1::BatchskipAhead(nSkip); services::Status s; - if (errcode == skipAheadMethodErrcode) + if (errcode == __DAAL_RNG_ERROR_SKIPAHEAD_UNSUPPORTED) s.add(ErrorSkipAheadUnsupported); else if (errcode) s.add(ErrorIncorrectErrorcodeFromGenerator); diff --git a/cpp/daal/src/algorithms/engines/mt2203/mt2203_kernel.h b/cpp/daal/src/algorithms/engines/mt2203/mt2203_kernel.h index b7de119367f..e588a02c8fb 100644 --- a/cpp/daal/src/algorithms/engines/mt2203/mt2203_kernel.h +++ b/cpp/daal/src/algorithms/engines/mt2203/mt2203_kernel.h @@ -19,8 +19,8 @@ // Declaration of template function that calculate mt2203s. //-- -#ifndef __MCG59_KERNEL_H__ -#define __MCG59_KERNEL_H__ +#ifndef __MT2203_KERNEL_H__ +#define __MT2203_KERNEL_H__ #include "algorithms/engines/mt2203/mt2203.h" #include "src/algorithms/kernel.h" diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10.cpp b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10.cpp new file mode 100644 index 00000000000..47fb7dae70f --- /dev/null +++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10.cpp @@ -0,0 +1,59 @@ +/* file: philox4x32x10.cpp */ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +//++ +// Implementation of the Philox4x32-10 engine: a counter-based pseudorandom number generator (PRNG) +// that uses 4x32-bit keys and performs 10 rounds of mixing to produce high-quality randomness. +//-- + +#include "algorithms/engines/philox4x32x10/philox4x32x10.h" +#include "src/externals/service_dispatch.h" +#include "src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h" + +namespace daal +{ +namespace algorithms +{ +namespace engines +{ +namespace philox4x32x10 +{ +namespace interface1 +{ +using namespace daal::services; +using namespace philox4x32x10::internal; + +template +SharedPtr > Batch::create(size_t seed) +{ + SharedPtr > engPtr; +#define DAAL_CREATE_ENGINE_CPU(cpuId, ...) engPtr.reset(new BatchImpl(__VA_ARGS__)); + + DAAL_DISPATCH_FUNCTION_BY_CPU(DAAL_CREATE_ENGINE_CPU, seed); + +#undef DAAL_CREATE_ENGINE_CPU + return engPtr; +} + +template class Batch; +template class Batch; + +} // namespace interface1 +} // namespace philox4x32x10 +} // namespace engines +} // namespace algorithms +} // namespace daal diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_container.h b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_container.h new file mode 100644 index 00000000000..9cb747e95a8 --- /dev/null +++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_container.h @@ -0,0 +1,68 @@ +/* file: philox4x32x10_batch_container.h */ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/* +//++ +// Implementation of philox4x32x10 calculation algorithm container. 
+//-- +*/ + +#ifndef __PHILOX4X32X10_BATCH_CONTAINER_H__ +#define __PHILOX4X32X10_BATCH_CONTAINER_H__ + +#include "algorithms/engines/philox4x32x10/philox4x32x10.h" +#include "src/algorithms/engines/philox4x32x10/philox4x32x10_kernel.h" + +namespace daal +{ +namespace algorithms +{ +namespace engines +{ +namespace philox4x32x10 +{ +namespace interface1 +{ +template +BatchContainer::BatchContainer(daal::services::Environment::env * daalEnv) : AnalysisContainerIface(daalEnv) +{ + __DAAL_INITIALIZE_KERNELS(internal::philox4x32x10Kernel, algorithmFPType, method); +} + +template +BatchContainer::~BatchContainer() +{ + __DAAL_DEINITIALIZE_KERNELS(); +} + +template +services::Status BatchContainer::compute() +{ + daal::services::Environment::env & env = *_env; + engines::Result * result = static_cast(_res); + NumericTable * resultTable = result->get(engines::randomNumbers).get(); + + __DAAL_CALL_KERNEL(env, internal::philox4x32x10Kernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, resultTable); +} + +} // namespace interface1 +} // namespace philox4x32x10 +} // namespace engines +} // namespace algorithms +} // namespace daal + +#endif diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h new file mode 100644 index 00000000000..1f7b40526ac --- /dev/null +++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h @@ -0,0 +1,114 @@ +/* file: philox4x32x10_batch_impl.h */ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +/* +//++ +// Implementation of the class defining the philox4x32x10 engine +//-- +*/ + +#include "algorithms/engines/philox4x32x10/philox4x32x10.h" +#include "src/algorithms/engines/engine_batch_impl.h" +#include "src/externals/service_rng.h" +#include "src/data_management/service_numeric_table.h" + +namespace daal +{ +namespace algorithms +{ +namespace engines +{ +namespace philox4x32x10 +{ +namespace internal +{ +template +class BatchImpl : public algorithms::engines::philox4x32x10::interface1::Batch, + public algorithms::engines::internal::BatchBaseImpl +{ +public: + typedef algorithms::engines::philox4x32x10::interface1::Batch super1; + typedef algorithms::engines::internal::BatchBaseImpl super2; + BatchImpl(size_t seed = 777) : baseRng(seed, __DAAL_BRNG_PHILOX4X32X10), super2(seed) {} + + void * getState() DAAL_C11_OVERRIDE { return baseRng.getState(); } + + int getStateSize() const DAAL_C11_OVERRIDE { return baseRng.getStateSize(); } + + services::Status saveStateImpl(byte * dest) const DAAL_C11_OVERRIDE + { + DAAL_CHECK(!baseRng.saveState((void *)dest), ErrorIncorrectErrorcodeFromGenerator); + return services::Status(); + } + + services::Status loadStateImpl(const byte * src) DAAL_C11_OVERRIDE + { + DAAL_CHECK(!baseRng.loadState((const void *)src), ErrorIncorrectErrorcodeFromGenerator); + return services::Status(); + } + + services::Status leapfrogImpl(size_t threadNum, size_t nThreads) DAAL_C11_OVERRIDE + { + int errcode = baseRng.leapfrog(threadNum, nThreads); + services::Status s; + if (errcode == __DAAL_RNG_ERROR_LEAPFROG_UNSUPPORTED) + s.add(ErrorLeapfrogUnsupported); + else if (errcode) + s.add(ErrorIncorrectErrorcodeFromGenerator); + return s; + } + + services::Status skipAheadImpl(size_t nSkip) DAAL_C11_OVERRIDE + { + int errcode = baseRng.skipAhead(nSkip); + services::Status s; + if (errcode == __DAAL_RNG_ERROR_SKIPAHEAD_UNSUPPORTED) + s.add(ErrorSkipAheadUnsupported); + else if (errcode) + s.add(ErrorIncorrectErrorcodeFromGenerator); + return s; + } + + virtual BatchImpl * cloneImpl() const DAAL_C11_OVERRIDE + { + return new BatchImpl(*this); + } + + bool hasSupport(engines::internal::ParallelizationTechnique technique) const DAAL_C11_OVERRIDE + { + switch (technique) + { + case engines::internal::family: return false; + case engines::internal::skipahead: return true; + case engines::internal::leapfrog: return false; + } + return false; + } + + ~BatchImpl() {} + +protected: + BatchImpl(const BatchImpl & other) : super1(other), super2(other), baseRng(other.baseRng) {} + + daal::internal::BaseRNGsInst baseRng; +}; + +} // namespace internal +} // namespace philox4x32x10 +} // namespace engines +} // namespace algorithms +} // namespace daal diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_cpu.cpp b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_cpu.cpp new file mode 100644 index 00000000000..946517c1d9c --- /dev/null +++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_cpu.cpp @@ -0,0 +1,47 @@ +/* file: philox4x32x10_dense_default_batch_fpt_cpu.cpp */ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +//++ +// Implementation of philox4x32x10 calculation functions. +//-- + +#include "src/algorithms/engines/philox4x32x10/philox4x32x10_batch_container.h" +#include "src/algorithms/engines/philox4x32x10/philox4x32x10_kernel.h" +#include "src/algorithms/engines/philox4x32x10/philox4x32x10_impl.i" + +namespace daal +{ +namespace algorithms +{ +namespace engines +{ +namespace philox4x32x10 +{ +namespace interface1 +{ +template class BatchContainer; +} // namespace interface1 + +namespace internal +{ +template class philox4x32x10Kernel; +} // namespace internal + +} // namespace philox4x32x10 +} // namespace engines +} // namespace algorithms +} // namespace daal diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_dispatcher.cpp new file mode 100644 index 00000000000..1640fc4ec12 --- /dev/null +++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_dispatcher.cpp @@ -0,0 +1,30 @@ +/* file: philox4x32x10_dense_default_batch_fpt_dispatcher.cpp */ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +//++ +// Implementation of philox4x32x10 calculation algorithm dispatcher. +//-- + +#include "src/algorithms/engines/philox4x32x10/philox4x32x10_batch_container.h" + +namespace daal +{ +namespace algorithms +{ +__DAAL_INSTANTIATE_DISPATCH_CONTAINER(engines::philox4x32x10::BatchContainer, batch, DAAL_FPTYPE, engines::philox4x32x10::defaultDense) +} // namespace algorithms +} // namespace daal diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_impl.i b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_impl.i new file mode 100644 index 00000000000..5aa5addc22b --- /dev/null +++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_impl.i @@ -0,0 +1,49 @@ +/* file: philox4x32x10_impl.i */ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/* +//++ +// Implementation of philox4x32x10 algorithm. +//-- +*/ + +#ifndef __PHILOX4X32X10_IMPL_I__ +#define __PHILOX4X32X10_IMPL_I__ + +namespace daal +{ +namespace algorithms +{ +namespace engines +{ +namespace philox4x32x10 +{ +namespace internal +{ +template +Status philox4x32x10Kernel::compute(NumericTable * resultTensor) +{ + return Status(); +} + +} // namespace internal +} // namespace philox4x32x10 +} // namespace engines +} // namespace algorithms +} // namespace daal + +#endif diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_kernel.h b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_kernel.h new file mode 100644 index 00000000000..5870d781abd --- /dev/null +++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_kernel.h @@ -0,0 +1,58 @@ +/* file: philox4x32x10_kernel.h */ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +//++ +// Declaration of a template function for generating values using the Philox4x32-10 engine. 
+//-- + +#ifndef __PHILOX4X32X10_KERNEL_H__ +#define __PHILOX4X32X10_KERNEL_H__ + +#include "algorithms/engines/philox4x32x10/philox4x32x10.h" +#include "src/algorithms/kernel.h" +#include "data_management/data/numeric_table.h" + +using namespace daal::services; +using namespace daal::data_management; + +namespace daal +{ +namespace algorithms +{ +namespace engines +{ +namespace philox4x32x10 +{ +namespace internal +{ +/** + * \brief Kernel for philox4x32x10 calculation + */ +template +class philox4x32x10Kernel : public Kernel +{ +public: + Status compute(NumericTable * resultTable); +}; + +} // namespace internal +} // namespace philox4x32x10 +} // namespace engines +} // namespace algorithms +} // namespace daal + +#endif diff --git a/cpp/daal/src/externals/service_rng_mkl.h b/cpp/daal/src/externals/service_rng_mkl.h index b2dcd81b78b..425695c7f66 100644 --- a/cpp/daal/src/externals/service_rng_mkl.h +++ b/cpp/daal/src/externals/service_rng_mkl.h @@ -32,6 +32,8 @@ #define __DAAL_BRNG_MT2203 VSL_BRNG_MT2203 #define __DAAL_BRNG_MT19937 VSL_BRNG_MT19937 #define __DAAL_BRNG_MCG59 VSL_BRNG_MCG59 +#define __DAAL_BRNG_MRG32K3A VSL_BRNG_MRG32K3A +#define __DAAL_BRNG_PHILOX4X32X10 VSL_BRNG_PHILOX4X32X10 #define __DAAL_RNG_METHOD_UNIFORM_STD VSL_RNG_METHOD_UNIFORM_STD #define __DAAL_RNG_METHOD_UNIFORMBITS32_STD 0 #define __DAAL_RNG_METHOD_BERNOULLI_ICDF VSL_RNG_METHOD_BERNOULLI_ICDF @@ -39,6 +41,10 @@ #define __DAAL_RNG_METHOD_GAUSSIAN_BOXMULLER2 VSL_RNG_METHOD_GAUSSIAN_BOXMULLER2 #define __DAAL_RNG_METHOD_GAUSSIAN_ICDF VSL_RNG_METHOD_GAUSSIAN_ICDF +// Errors +#define __DAAL_RNG_ERROR_LEAPFROG_UNSUPPORTED -1002 +#define __DAAL_RNG_ERROR_SKIPAHEAD_UNSUPPORTED -1003 + namespace daal { namespace internal diff --git a/cpp/daal/src/externals/service_rng_openrng.h b/cpp/daal/src/externals/service_rng_openrng.h index dd70c644606..0e49c62c83b 100644 --- a/cpp/daal/src/externals/service_rng_openrng.h +++ b/cpp/daal/src/externals/service_rng_openrng.h @@ -25,6 +25,8 @@ #define __DAAL_BRNG_MT2203 VSL_BRNG_MT2203 #define __DAAL_BRNG_MT19937 VSL_BRNG_MT19937 #define __DAAL_BRNG_MCG59 VSL_BRNG_MCG59 +#define __DAAL_BRNG_MRG32K3A VSL_BRNG_MRG32K3A +#define __DAAL_BRNG_PHILOX4X32X10 VSL_BRNG_PHILOX4X32X10 #define __DAAL_RNG_METHOD_UNIFORM_STD VSL_RNG_METHOD_UNIFORM_STD #define __DAAL_RNG_METHOD_UNIFORMBITS32_STD 0 #define __DAAL_RNG_METHOD_BERNOULLI_ICDF VSL_RNG_METHOD_BERNOULLI_ICDF @@ -32,6 +34,10 @@ #define __DAAL_RNG_METHOD_GAUSSIAN_BOXMULLER2 VSL_RNG_METHOD_GAUSSIAN_BOXMULLER2 #define __DAAL_RNG_METHOD_GAUSSIAN_ICDF VSL_RNG_METHOD_GAUSSIAN_ICDF +// Errors +#define __DAAL_RNG_ERROR_LEAPFROG_UNSUPPORTED -1002 +#define __DAAL_RNG_ERROR_SKIPAHEAD_UNSUPPORTED -1003 + namespace daal { namespace internal diff --git a/cpp/daal/src/externals/service_rng_ref.h b/cpp/daal/src/externals/service_rng_ref.h index fc56fcf6205..f2df997c87d 100644 --- a/cpp/daal/src/externals/service_rng_ref.h +++ b/cpp/daal/src/externals/service_rng_ref.h @@ -36,16 +36,22 @@ #include // RNGs - #define __DAAL_BRNG_MT2203 (1 << 20) * 9 //VSL_BRNG_MT2203 - #define __DAAL_BRNG_MT19937 (1 << 20) * 8 //VSL_BRNG_MT19937 - #define __DAAL_BRNG_MCG59 (1 << 20) * 4 //VSL_BRNG_MCG59 - - #define __DAAL_RNG_METHOD_UNIFORM_STD 0 //VSL_RNG_METHOD_UNIFORM_STD - #define __DAAL_RNG_METHOD_UNIFORMBITS32_STD 4 - #define __DAAL_RNG_METHOD_BERNOULLI_ICDF 0 //VSL_RNG_METHOD_BERNOULLI_ICDF - #define __DAAL_RNG_METHOD_GAUSSIAN_BOXMULLER 0 //VSL_RNG_METHOD_GAUSSIAN_BOXMULLER - #define __DAAL_RNG_METHOD_GAUSSIAN_BOXMULLER2 1 //VSL_RNG_METHOD_GAUSSIAN_BOXMULLER2 - 
#define __DAAL_RNG_METHOD_GAUSSIAN_ICDF 2 //VSL_RNG_METHOD_GAUSSIAN_ICDF + #define __DAAL_BRNG_MT2203 (1 << 20) * 9 //VSL_BRNG_MT2203 + #define __DAAL_BRNG_MT19937 (1 << 20) * 8 //VSL_BRNG_MT19937 + #define __DAAL_BRNG_MCG59 (1 << 20) * 4 //VSL_BRNG_MCG59 + #define __DAAL_BRNG_MRG32K3A (1 << 20) * 3 //VSL_BRNG_MRG32K3A + #define __DAAL_BRNG_PHILOX4X32X10 (1 << 20) * 16 //VSL_BRNG_PHILOX4X32X10 + + #define __DAAL_RNG_METHOD_UNIFORM_STD 0 //VSL_RNG_METHOD_UNIFORM_STD + #define __DAAL_RNG_METHOD_UNIFORMBITS32_STD 4 + #define __DAAL_RNG_METHOD_BERNOULLI_ICDF 0 //VSL_RNG_METHOD_BERNOULLI_ICDF + #define __DAAL_RNG_METHOD_GAUSSIAN_BOXMULLER 0 //VSL_RNG_METHOD_GAUSSIAN_BOXMULLER + #define __DAAL_RNG_METHOD_GAUSSIAN_BOXMULLER2 1 //VSL_RNG_METHOD_GAUSSIAN_BOXMULLER2 + #define __DAAL_RNG_METHOD_GAUSSIAN_ICDF 2 //VSL_RNG_METHOD_GAUSSIAN_ICDF + + // Errors + #define __DAAL_RNG_ERROR_LEAPFROG_UNSUPPORTED -1002 + #define __DAAL_RNG_ERROR_SKIPAHEAD_UNSUPPORTED -1003 namespace daal { diff --git a/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp b/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp index 4da1866e277..bdcc3f1487a 100644 --- a/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp +++ b/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp @@ -24,7 +24,7 @@ #include "oneapi/dal/backend/memory.hpp" #include "oneapi/dal/backend/interop/common.hpp" #include "oneapi/dal/table/homogen.hpp" -#include "oneapi/dal/backend/primitives/rng/rng_engine.hpp" +#include "oneapi/dal/backend/primitives/rng/rng.hpp" #include "oneapi/dal/detail/threading.hpp" namespace oneapi::dal::preview::connected_components::backend { @@ -90,9 +90,12 @@ std::int32_t most_frequent_element(const std::atomic *components, const std::int64_t &samples_count = 1024) { std::int32_t *rnd_vertex_ids = allocate(vertex_allocator, samples_count); - dal::backend::primitives::engine eng; - dal::backend::primitives::rng rn_gen; - rn_gen.uniform(samples_count, rnd_vertex_ids, eng.get_state(), 0, vertex_count); + dal::backend::primitives::host_engine eng; + dal::backend::primitives::uniform(samples_count, + rnd_vertex_ids, + eng, + 0, + vertex_count); std::int32_t *root_sample_counts = allocate(vertex_allocator, vertex_count); diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/infer_kernel_impl_dpc.cpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/infer_kernel_impl_dpc.cpp index 19da49ffb74..23bcca39fde 100644 --- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/infer_kernel_impl_dpc.cpp +++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/infer_kernel_impl_dpc.cpp @@ -126,6 +126,7 @@ infer_kernel_impl::predict_by_tree_group_weighted( const Float* cls_prb_list_ptr = class_proba_list.get_data(); Index obs_tree_group_response_count = ctx.class_count * ctx.tree_in_group_count; + de::check_mul_overflow(ctx.row_count, obs_tree_group_response_count); auto [obs_response_list, zero_obs_response_event] = pr::ndarray::zeros(queue_, diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/infer_model_manager.hpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/infer_model_manager.hpp index c0dd4cca565..770f73b5269 100644 --- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/infer_model_manager.hpp +++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/infer_model_manager.hpp @@ -87,7 +87,6 @@ class infer_model_manager { } max_tree_size_ = 
dal::detail::integral_cast(tree_size_max); - const Index tree_block_size = dal::detail::check_mul_overflow(max_tree_size_, tree_count); auto fi_list_host = dal::backend::primitives::ndarray::empty({ tree_block_size }); diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_feature_type_dpc.cpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_feature_type_dpc.cpp index 7306533ed50..ee17b4ffabd 100644 --- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_feature_type_dpc.cpp +++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_feature_type_dpc.cpp @@ -20,6 +20,7 @@ #include "oneapi/dal/table/row_accessor.hpp" #include "oneapi/dal/backend/memory.hpp" #include "oneapi/dal/detail/profiler.hpp" +#include #ifdef ONEDAL_DATA_PARALLEL @@ -29,6 +30,12 @@ namespace de = dal::detail; namespace bk = dal::backend; namespace pr = dal::backend::primitives; +template +std::int64_t propose_block_size(const sycl::queue& q, const std::int64_t r) { + constexpr std::int64_t fsize = sizeof(Float); + return 0x10000l * (8 / fsize); +} + template inline sycl::event sort_inplace(sycl::queue& queue_, pr::ndarray& src, @@ -56,18 +63,29 @@ sycl::event indexed_features::extract_column( Float* values = values_nd.get_mutable_data(); Index* indices = indices_nd.get_mutable_data(); auto column_count = column_count_; - - const sycl::range<1> range = de::integral_cast(row_count_); - - auto event = queue_.submit([&](sycl::handler& h) { - h.depends_on(deps); - h.parallel_for(range, [=](sycl::id<1> idx) { - values[idx] = data[idx * column_count + feature_id]; - indices[idx] = idx; + const auto block_size = propose_block_size(queue_, row_count_); + const bk::uniform_blocking blocking(row_count_, block_size); + + std::vector events(blocking.get_block_count()); + for (std::int64_t block_index = 0; block_index < blocking.get_block_count(); ++block_index) { + const auto first_row = blocking.get_block_start_index(block_index); + const auto last_row = blocking.get_block_end_index(block_index); + const auto curr_block = last_row - first_row; + ONEDAL_ASSERT(curr_block > 0); + + auto event = queue_.submit([&](sycl::handler& cgh) { + cgh.depends_on(deps); + cgh.parallel_for<>(de::integral_cast(curr_block), [=](sycl::id<1> idx) { + const std::int64_t row = idx + first_row; + + values[row] = data[row * column_count + feature_id]; + indices[row] = row; + }); }); - }); - return event; + events.push_back(event); + } + return bk::wait_or_pass(events); } template diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp index 9dfe252e849..d15f1704065 100644 --- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp +++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2021 Intel Corporation +* Copyright 2021-2022 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -20,7 +20,8 @@ #include "oneapi/dal/backend/primitives/ndarray.hpp" #include "oneapi/dal/backend/primitives/utils.hpp" #include "oneapi/dal/algo/decision_forest/train_types.hpp" - +#include "oneapi/dal/backend/primitives/rng/rng.hpp" +#include "oneapi/dal/backend/primitives/rng/rng.hpp" #include "oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp" #include "oneapi/dal/algo/decision_forest/backend/gpu/train_misc_structs.hpp" @@ -50,7 +51,7 @@ class train_kernel_hist_impl { using model_manager_t = train_model_manager; using train_context_t = train_context; using imp_data_t = impurity_data; - using rng_engine_t = pr::engine; + using rng_engine_t = pr::dpc_engine; using rng_engine_list_t = std::vector; using msg = dal::detail::error_messages; using comm_t = bk::communicator; @@ -62,7 +63,7 @@ class train_kernel_hist_impl { train_kernel_hist_impl(const bk::context_gpu& ctx) : queue_(ctx.get_queue()), comm_(ctx.get_communicator()), - train_service_kernels_(queue_) {} + train_service_kernels_(ctx.get_queue()) {} ~train_kernel_hist_impl() = default; result_t operator()(const descriptor_t& desc, @@ -83,13 +84,11 @@ class train_kernel_hist_impl { pr::ndarray& node_list, pr::ndarray& tree_order_level, Index engine_offset, - Index node_count); + Index node_count, + const bk::event_vector& deps = {}); void validate_input(const descriptor_t& desc, const table& data, const table& labels) const; - Index get_row_total_count(bool distr_mode, Index row_count); - Index get_global_row_offset(bool distr_mode, Index row_count); - /// Initializes `ctx` training context structure based on data and /// descriptor class. Filling and calculating all parameters in context, /// for example, tree count, required memory size, calculating indexed features, etc. @@ -149,6 +148,24 @@ class train_kernel_hist_impl { Index node_count, const bk::event_vector& deps = {}); + sycl::event compute_initial_imp_for_node_list_regression( + const train_context_t& ctx, + const pr::ndarray& node_list, + const pr::ndarray& local_sum_hist, + const pr::ndarray& local_sum2cent_hist, + imp_data_t& imp_data_list, + Index node_count, + const bk::event_vector& deps = {}); + + sycl::event compute_local_sum_histogram(const train_context_t& ctx, + const pr::ndarray& response, + const pr::ndarray& tree_order, + const pr::ndarray& node_list, + pr::ndarray& local_sum_hist, + pr::ndarray& local_sum2cent_hist, + Index node_count, + const bk::event_vector& deps = {}); + /// Computes initial histograms for each node to compute impurity. 
/// /// @param[in] ctx a training context structure for a GPU backend @@ -575,7 +592,7 @@ class train_kernel_hist_impl { pr::ndarray& oob_per_obs_list, pr::ndarray& var_imp, pr::ndarray& var_imp_variance, - const rng_engine_list_t& rng_engine_arr, + rng_engine_list_t& rng_engine_arr, Index tree_idx, Index tree_in_block, Index built_tree_count, diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp index fc875683784..69593a81321 100644 --- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp +++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2021 Intel Corporation +* Copyright 2021-2022 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,7 +19,7 @@ #include "oneapi/dal/table/row_accessor.hpp" #include "oneapi/dal/detail/profiler.hpp" #include "oneapi/dal/algo/decision_forest/backend/gpu/train_helpers.hpp" - +#include #ifdef ONEDAL_DATA_PARALLEL #include "oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp" @@ -91,47 +91,16 @@ void train_kernel_hist_impl::validate_input(const descr } } -template -Index train_kernel_hist_impl::get_row_total_count(bool distr_mode, - Index row_count) { - Index row_total_count = row_count; - - if (distr_mode) { - ONEDAL_PROFILER_TASK(allreduce_row_total_count); - comm_.allreduce(row_total_count).wait(); - } - - return row_total_count; -} - -template -Index train_kernel_hist_impl::get_global_row_offset(bool distr_mode, - Index row_count) { - Index global_row_offset = 0; - - if (distr_mode) { - auto row_count_list_host = pr::ndarray::empty({ comm_.get_rank_count() }); - Index* row_count_list_host_ptr = row_count_list_host.get_mutable_data(); - { - ONEDAL_PROFILER_TASK(allgather_row_count); - comm_.allgather(row_count, row_count_list_host.flatten()).wait(); - } - - for (std::int64_t i = 0; i < comm_.get_rank(); ++i) { - global_row_offset += row_count_list_host_ptr[i]; - } - } - - return global_row_offset; -} - template void train_kernel_hist_impl::init_params(train_context_t& ctx, const descriptor_t& desc, const table& data, const table& responses, const table& weights) { - ctx.distr_mode_ = (comm_.get_rank_count() > 1); + ONEDAL_PROFILER_TASK(init_params, queue_); + std::int64_t rank_count = comm_.get_rank_count(); + ctx.distr_mode_ = (rank_count > 1); + auto current_rank = comm_.get_rank(); ctx.use_private_mem_buf_ = true; @@ -143,7 +112,11 @@ void train_kernel_hist_impl::init_params(train_context_ } ctx.row_count_ = de::integral_cast(data.get_row_count()); - ctx.row_total_count_ = get_row_total_count(ctx.distr_mode_, ctx.row_count_); + ctx.row_total_count_ = ctx.row_count_; + { + ONEDAL_PROFILER_TASK(allreduce_total_row_count_exactly_it, queue_); + comm_.allreduce(ctx.row_total_count_, spmd::reduce_op::sum).wait(); + } ctx.column_count_ = de::integral_cast(data.get_column_count()); @@ -151,10 +124,22 @@ void train_kernel_hist_impl::init_params(train_context_ ctx.selected_row_count_ = ctx.distr_mode_ ? 
impl_const_t::bad_val_ : desc.get_observations_per_tree_fraction() * ctx.row_count_; + ctx.selected_row_total_count_ = desc.get_observations_per_tree_fraction() * ctx.row_total_count_; - ctx.global_row_offset_ = get_global_row_offset(ctx.distr_mode_, ctx.row_count_); + auto global_rank_offsets = array::zeros(rank_count); + global_rank_offsets.get_mutable_data()[current_rank] = ctx.row_count_; + { + ONEDAL_PROFILER_TASK(allreduce_recv_counts, queue_); + comm_.allreduce(global_rank_offsets, spmd::reduce_op::sum).wait(); + } + + ctx.global_row_offset_ = 0; + for (std::int64_t i = 0; i < current_rank; i++) { + ONEDAL_ASSERT(global_rank_offsets.get_data()[i] >= 0); + ctx.global_row_offset_ += global_rank_offsets.get_data()[i]; + } ctx.tree_count_ = de::integral_cast(desc.get_tree_count()); @@ -211,7 +196,7 @@ void train_kernel_hist_impl::init_params(train_context_ bin_borders_host_[clmn_idx] = ind_ftrs.get_bin_borders(clmn_idx).to_host(queue_); } - data_host_ = pr::table2ndarray_1d(queue_, data, alloc::device).to_host(queue_); + data_host_ = pr::table2ndarray_1d(queue_, data, alloc::host); response_nd_ = pr::table2ndarray_1d(queue_, responses, alloc::device); @@ -332,7 +317,9 @@ void train_kernel_hist_impl::init_params(train_context_ template void train_kernel_hist_impl::allocate_buffers(const train_context_t& ctx) { - de::check_mul_overflow(ctx.selected_row_total_count_, ctx.tree_in_block_); + ONEDAL_PROFILER_TASK(allocate_buffers, queue_); + de::check_mul_overflow(ctx.selected_row_total_count_, + static_cast(ctx.tree_in_block_)); // main tree order and auxilliary one are used for partitioning tree_order_lev_ = @@ -372,7 +359,8 @@ sycl::event train_kernel_hist_impl::gen_initial_tree_or pr::ndarray& node_list_host, pr::ndarray& tree_order_level, Index engine_offset, - Index node_count) { + Index node_count, + const bk::event_vector& deps) { ONEDAL_PROFILER_TASK(gen_initial_tree_order, queue_); ONEDAL_ASSERT(node_list_host.get_count() == node_count * impl_const_t::node_prop_count_); @@ -382,50 +370,74 @@ sycl::event train_kernel_hist_impl::gen_initial_tree_or sycl::event last_event; if (ctx.bootstrap_) { - auto selected_row_global_host = - pr::ndarray::empty({ ctx.selected_row_total_count_ * ctx.tree_in_block_ }); - pr::ndarray selected_row_host; + auto selected_row_global = + pr::ndarray::empty(queue_, + { ctx.selected_row_total_count_ * ctx.tree_in_block_ }, + alloc::device); + pr::ndarray selected_row; if (ctx.distr_mode_) { - selected_row_host = pr::ndarray::empty( - { ctx.selected_row_total_count_ * ctx.tree_in_block_ }); + selected_row = + pr::ndarray::empty(queue_, + { ctx.selected_row_total_count_ * ctx.tree_in_block_ }, + alloc::device); } - Index* const selected_row_global_ptr = selected_row_global_host.get_mutable_data(); - Index* const selected_row_ptr = - ctx.distr_mode_ ? selected_row_host.get_mutable_data() : nullptr; + Index* const selected_row_global_ptr = selected_row_global.get_mutable_data(); + Index* const selected_row_ptr = ctx.distr_mode_ ? 
selected_row.get_mutable_data() : nullptr; Index* const node_list_ptr = node_list_host.get_mutable_data(); for (Index node_idx = 0; node_idx < node_count; ++node_idx) { - pr::rng rn_gen; Index* gen_row_idx_global_ptr = selected_row_global_ptr + ctx.selected_row_total_count_ * node_idx; - rn_gen.uniform(ctx.selected_row_total_count_, - gen_row_idx_global_ptr, - rng_engine_list[engine_offset + node_idx].get_state(), - 0, - ctx.row_total_count_); + uniform(queue_, + ctx.selected_row_total_count_, + gen_row_idx_global_ptr, + rng_engine_list[engine_offset + node_idx], + 0, + ctx.row_total_count_, + { deps }); if (ctx.distr_mode_) { Index* node_ptr = node_list_ptr + node_idx * impl_const_t::node_prop_count_; - Index* src = gen_row_idx_global_ptr; Index* const dst = selected_row_ptr + ctx.selected_row_total_count_ * node_idx; - Index row_idx = 0; - for (Index i = 0; i < ctx.selected_row_total_count_; ++i) { - dst[i] = 0; - if (src[i] >= ctx.global_row_offset_ && - src[i] < (ctx.global_row_offset_ + ctx.row_count_)) { - dst[row_idx++] = src[i] - ctx.global_row_offset_; - } - } - node_ptr[impl_const_t::ind_lrc] = row_idx; + auto [row_index, row_index_event] = + pr::ndarray::full(queue_, 1, 0, alloc::device); + row_index_event.wait_and_throw(); + Index* row_idx_ptr = row_index.get_mutable_data(); + const sycl::nd_range<1> nd_range = + bk::make_multiple_nd_range_1d(ctx.selected_row_total_count_, 1); + auto event_ = queue_.submit([&](sycl::handler& cgh) { + cgh.depends_on({ last_event }); + cgh.parallel_for(nd_range, [=](sycl::nd_item<1> id) { + auto idx = id.get_global_id(0); + dst[idx] = 0; + if (gen_row_idx_global_ptr[idx] >= ctx.global_row_offset_ && + gen_row_idx_global_ptr[idx] < + (ctx.global_row_offset_ + ctx.row_count_)) { + sycl::atomic_ref< + Index, + sycl::memory_order::relaxed, + sycl::memory_scope::device, + sycl::access::address_space::ext_intel_global_device_space> + counter_atomic(row_idx_ptr[0]); + auto cur_idx = counter_atomic.fetch_add(1); + dst[cur_idx] = gen_row_idx_global_ptr[idx] - ctx.global_row_offset_; + } + }); + }); + auto set_event = queue_.submit([&](sycl::handler& cgh) { + cgh.depends_on(event_); + cgh.parallel_for(sycl::range<1>{ std::size_t(1) }, [=](sycl::id<1> idx) { + node_ptr[impl_const_t::ind_lrc] = row_idx_ptr[0]; + }); + }); + set_event.wait_and_throw(); } } - last_event = ctx.distr_mode_ - ? tree_order_level.assign_from_host(queue_, selected_row_host) - : tree_order_level.assign_from_host(queue_, selected_row_global_host); + ctx.distr_mode_ ? tree_order_level = selected_row : tree_order_level = selected_row_global; } else { Index row_count = ctx.selected_row_count_; @@ -433,29 +445,30 @@ sycl::event train_kernel_hist_impl::gen_initial_tree_or if (ctx.distr_mode_) { row_count = 0; if (ctx.global_row_offset_ < ctx.selected_row_total_count_) { - row_count = std::min(ctx.selected_row_total_count_ - ctx.global_row_offset_, + row_count = std::min(static_cast(ctx.selected_row_total_count_ - + ctx.global_row_offset_), ctx.row_count_); } // in case of no bootstrap // it is valid case if this worker's rows set wasn't taken for tree build // i.e. 
row_count can be eq 0 - Index* node_list_ptr = node_list_host.get_mutable_data(); - - for (Index node_idx = 0; node_idx < node_count; ++node_idx) { - Index* node_ptr = node_list_ptr + node_idx * impl_const_t::node_prop_count_; - node_ptr[impl_const_t::ind_lrc] = row_count; + auto set_event = queue_.submit([&](sycl::handler& cgh) { + cgh.parallel_for(sycl::range<1>{ std::size_t(node_count) }, [=](sycl::id<1> idx) { + Index* node_ptr = node_list_ptr + idx * impl_const_t::node_prop_count_; + node_ptr[impl_const_t::ind_lrc] = row_count; + }); + }); + set_event.wait_and_throw(); + + if (row_count > 0) { + last_event = train_service_kernels_.initialize_tree_order(tree_order_level, + node_count, + row_count, + stride); } } - - if (row_count > 0) { - last_event = train_service_kernels_.initialize_tree_order(tree_order_level, - node_count, - row_count, - stride); - } } - return last_event; } @@ -469,7 +482,6 @@ train_kernel_hist_impl::gen_feature_list( ONEDAL_PROFILER_TASK(gen_feature_list, queue_); ONEDAL_ASSERT(node_vs_tree_map_list.get_count() == node_count); - de::check_mul_overflow((node_count + 1), ctx.selected_ftr_count_); // first part is used for features indices, +1 block - part for generator auto selected_features_host = @@ -479,36 +491,43 @@ train_kernel_hist_impl::gen_feature_list( { node_count * ctx.selected_ftr_count_ }, alloc::device); - auto selected_features_host_ptr = selected_features_host.get_mutable_data(); - auto node_vs_tree_map_list_host = node_vs_tree_map_list.to_host(queue_); - pr::rng rn_gen; - auto tree_map_ptr = node_vs_tree_map_list_host.get_mutable_data(); if (ctx.selected_ftr_count_ != ctx.column_count_) { + auto tree_map_ptr = node_vs_tree_map_list_host.get_mutable_data(); + auto selected_features_host_ptr = selected_features_host.get_mutable_data(); for (Index node = 0; node < node_count; ++node) { - rn_gen.uniform_without_replacement( + pr::uniform_without_replacement( + queue_, ctx.selected_ftr_count_, selected_features_host_ptr + node * ctx.selected_ftr_count_, selected_features_host_ptr + (node + 1) * ctx.selected_ftr_count_, - rng_engine_list[tree_map_ptr[node]].get_state(), + rng_engine_list[tree_map_ptr[node]], 0, ctx.column_count_); } + auto event = selected_features_com.assign_from_host(queue_, + selected_features_host_ptr, + selected_features_com.get_count()); + + return std::tuple{ selected_features_com, event }; } else { + sycl::event fill_event; for (Index node = 0; node < node_count; ++node) { - for (Index i = 0; i < ctx.selected_ftr_count_; ++i) { - selected_features_host_ptr[node * ctx.selected_ftr_count_ + i] = i; - } + auto selected_features_host_ptr = selected_features_com.get_mutable_data(); + + fill_event = queue_.submit([&](sycl::handler& cgh) { + cgh.parallel_for( + sycl::range<1>{ std::size_t(ctx.selected_ftr_count_) }, + [=](sycl::id<1> idx) { + selected_features_host_ptr[node * ctx.selected_ftr_count_ + idx] = idx; + }); + }); } - } - auto event = selected_features_com.assign_from_host(queue_, - selected_features_host_ptr, - selected_features_com.get_count()); - - return std::tuple{ selected_features_com, event }; + return std::tuple{ selected_features_com, fill_event }; + } } template @@ -524,7 +543,6 @@ train_kernel_hist_impl::gen_random_thresholds( auto node_vs_tree_map_list_host = node_vs_tree_map.to_host(queue_); - pr::rng rn_gen; auto tree_map_ptr = node_vs_tree_map_list_host.get_mutable_data(); // Create arrays for random generated bins @@ -537,11 +555,12 @@ train_kernel_hist_impl::gen_random_thresholds( // Generate random bins for 
selected features for (Index node = 0; node < node_count; ++node) { - rn_gen.uniform(ctx.selected_ftr_count_, - random_bins_host_ptr + node * ctx.selected_ftr_count_, - rng_engine_list[tree_map_ptr[node]].get_state(), - 0.0f, - 1.0f); + pr::uniform(queue_, + ctx.selected_ftr_count_, + random_bins_host_ptr + node * ctx.selected_ftr_count_, + rng_engine_list[tree_map_ptr[node]], + 0.0f, + 1.0f); } auto event_rnd_generate = random_bins_com.assign_from_host(queue_, random_bins_host_ptr, random_bins_com.get_count()); @@ -758,54 +777,50 @@ sycl::event train_kernel_hist_impl::compute_initial_imp pr::ndarray& node_list, Index node_count, const bk::event_vector& deps) { + ONEDAL_PROFILER_TASK(compute_initial_imp_for_node_list, queue_); ONEDAL_ASSERT(imp_data_list.imp_list_.get_count() == node_count * impl_const_t::node_imp_prop_count_); - if constexpr (std::is_same_v) { - ONEDAL_ASSERT(imp_data_list.class_hist_list_.get_count() == node_count * ctx.class_count_); - } ONEDAL_ASSERT(node_list.get_count() == node_count * impl_const_t::node_prop_count_); + sycl::event event_; if constexpr (std::is_same_v) { - auto class_hist_list_host = imp_data_list.class_hist_list_.to_host(queue_, deps); - auto imp_list_host = imp_data_list.imp_list_.to_host(queue_); - auto node_list_host = node_list.to_host(queue_); + const Index* class_hist_list_ptr = imp_data_list.class_hist_list_.get_data(); + Float* imp_list_ptr = imp_data_list.imp_list_.get_mutable_data(); + Index* node_list_ptr = node_list.get_mutable_data(); + + // Launch kernel to compute impurity and winning class for each node; + // assign to the outer event_ so the submitted kernel is what gets returned + event_ = queue_.submit([&](sycl::handler& cgh) { + cgh.depends_on(deps); + cgh.parallel_for(sycl::range<1>(node_count), [=](sycl::id<1> idx) { + Index node_idx = idx; + const Index* node_histogram_ptr = class_hist_list_ptr + node_idx * ctx.class_count_; + Float* node_imp_ptr = imp_list_ptr + node_idx * impl_const_t::node_imp_prop_count_; + Index* node_ptr = node_list_ptr + node_idx * impl_const_t::node_prop_count_; - const Index* class_hist_list_host_ptr = class_hist_list_host.get_data(); - Float* imp_list_host_ptr = imp_list_host.get_mutable_data(); - Index* node_list_host_ptr = node_list_host.get_mutable_data(); + Index row_count = node_ptr[impl_const_t::ind_grc]; - for (Index node_idx = 0; node_idx < node_count; ++node_idx) { - const Index* node_histogram_ptr = - class_hist_list_host_ptr + node_idx * ctx.class_count_; - Float* node_imp_ptr = imp_list_host_ptr + node_idx * impl_const_t::node_imp_prop_count_; - Index* node_ptr = node_list_host_ptr + node_idx * impl_const_t::node_prop_count_; - - Index row_count = node_ptr[impl_const_t::ind_grc]; - - Float imp = Float(1); - Float div = Float(1) / (Float(row_count) * row_count); - Index max_cls_count = 0; - Index win_cls = 0; - Index cls_count = 0; - - for (Index cls_idx = 0; cls_idx < ctx.class_count_; ++cls_idx) { - cls_count = node_histogram_ptr[cls_idx]; - imp -= Float(cls_count) * (cls_count)*div; - - if (cls_count > max_cls_count) { - max_cls_count = cls_count; - win_cls = cls_idx; - } - } + Float imp = Float(1); + Float div = Float(1) / (Float(row_count) * row_count); + Index max_cls_count = 0; + Index win_cls = 0; + Index cls_count = 0; - node_ptr[impl_const_t::ind_win] = win_cls; - node_imp_ptr[0] = sycl::max(imp, Float(0)); - } - imp_data_list.imp_list_.assign_from_host(queue_, imp_list_host).wait_and_throw(); - node_list.assign_from_host(queue_, node_list_host).wait_and_throw(); + for (Index cls_idx = 0; cls_idx < ctx.class_count_; ++cls_idx) { + cls_count = 
node_histogram_ptr[cls_idx]; + imp -= cls_count * cls_count * div; + + if (cls_count > max_cls_count) { + max_cls_count = cls_count; + win_cls = cls_idx; + } + } + node_ptr[impl_const_t::ind_win] = win_cls; + node_imp_ptr[0] = sycl::max(imp, Float(0)); + }); + }); } - return sycl::event{}; + return event_; } template @@ -994,6 +1009,136 @@ Float* local_buf_ptr = local_buf.get_pointer().get(); return event; } +template +sycl::event train_kernel_hist_impl::compute_local_sum_histogram( + const train_context_t& ctx, + const pr::ndarray& response, + const pr::ndarray& tree_order, + const pr::ndarray& node_list, + pr::ndarray& local_sum_hist, + pr::ndarray& local_sum2cent_hist, + Index node_count, + const bk::event_vector& deps) { + ONEDAL_ASSERT(response.get_count() == ctx.row_count_); + ONEDAL_ASSERT(tree_order.get_count() == ctx.tree_in_block_ * ctx.selected_row_total_count_); + ONEDAL_ASSERT(node_list.get_count() == node_count * impl_const_t::node_prop_count_); + ONEDAL_ASSERT(local_sum_hist.get_count() == node_count); + ONEDAL_ASSERT(local_sum2cent_hist.get_count() == node_count); + + auto fill_event1 = local_sum_hist.fill(queue_, 0, deps); + auto fill_event2 = local_sum2cent_hist.fill(queue_, 0, deps); + + fill_event1.wait_and_throw(); + fill_event2.wait_and_throw(); + + const Float* response_ptr = response.get_data(); + const Index* tree_order_ptr = tree_order.get_data(); + const Index* node_list_ptr = node_list.get_data(); + Float* local_sum_hist_ptr = local_sum_hist.get_mutable_data(); + Float* local_sum2cent_hist_ptr = local_sum2cent_hist.get_mutable_data(); + + const Index node_prop_count = impl_const_t::node_prop_count_; + + auto local_size = ctx.preferable_group_size_; + const sycl::nd_range<2> nd_range = + bk::make_multiple_nd_range_2d({ local_size, node_count }, { local_size, 1 }); + + auto event = queue_.submit([&](sycl::handler& cgh) { + cgh.depends_on(deps); + local_accessor_rw_t local_sum_buf(local_size, cgh); + local_accessor_rw_t local_sum2cent_buf(local_size, cgh); + cgh.parallel_for(nd_range, [=](sycl::nd_item<2> item) { + const Index node_id = item.get_global_id()[1]; + const Index local_id = item.get_local_id()[0]; + const Index local_size = item.get_local_range()[0]; + + const Index* node_ptr = node_list_ptr + node_id * node_prop_count; + + const Index row_offset = node_ptr[impl_const_t::ind_ofs]; + const Index row_count = node_ptr[impl_const_t::ind_lrc]; + + const Index* node_tree_order_ptr = &tree_order_ptr[row_offset]; +#if __SYCL_COMPILER_VERSION >= 20230828 + Float* local_sum_buf_ptr = + local_sum_buf.template get_multi_ptr().get_raw(); + Float* local_sum2cent_buf_ptr = + local_sum2cent_buf.template get_multi_ptr().get_raw(); +#else + Float* local_sum_buf_ptr = local_sum_buf.get_pointer().get(); + Float* local_sum2cent_buf_ptr = local_sum2cent_buf.get_pointer().get(); +#endif + Float local_sum = Float(0); + Float local_sum2cent = Float(0); + for (Index i = local_id; i < row_count; i += local_size) { + Float value = response_ptr[node_tree_order_ptr[i]]; + local_sum += value; + local_sum2cent += value * value; + } + + local_sum_buf_ptr[local_id] = local_sum; + local_sum2cent_buf_ptr[local_id] = local_sum2cent; + + for (Index offset = local_size / 2; offset > 0; offset >>= 1) { + item.barrier(sycl::access::fence_space::local_space); + if (local_id < offset) { + local_sum_buf_ptr[local_id] += local_sum_buf_ptr[local_id + offset]; + local_sum2cent_buf_ptr[local_id] += local_sum2cent_buf_ptr[local_id + offset]; + } + } + + if (local_id == 0) { + 
local_sum_hist_ptr[node_id] = local_sum_buf_ptr[local_id]; + local_sum2cent_hist_ptr[node_id] = local_sum2cent_buf_ptr[local_id]; + } + }); + }); + + event.wait_and_throw(); + return event; +} + +template +sycl::event +train_kernel_hist_impl::compute_initial_imp_for_node_list_regression( + const train_context_t& ctx, + const pr::ndarray& node_list, + const pr::ndarray& local_sum_hist, + const pr::ndarray& local_sum2cent_hist, + imp_data_t& imp_data_list, + Index node_count, + const bk::event_vector& deps) { + ONEDAL_ASSERT(node_list.get_count() == node_count * impl_const_t::node_prop_count_); + ONEDAL_ASSERT(local_sum_hist.get_count() == node_count); + ONEDAL_ASSERT(local_sum2cent_hist.get_count() == node_count); + ONEDAL_ASSERT(imp_data_list.imp_list_.get_count() == + node_count * impl_const_t::node_imp_prop_count_); + + const Index* node_list_ptr = node_list.get_data(); + const Float* local_sum_hist_ptr = local_sum_hist.get_data(); + const Float* local_sum2cent_hist_ptr = local_sum2cent_hist.get_data(); + Float* imp_list_ptr = imp_data_list.imp_list_.get_mutable_data(); + + const sycl::range<1> range{ de::integral_cast(node_count) }; + + auto last_event = queue_.submit([&](sycl::handler& cgh) { + cgh.depends_on(deps); + cgh.parallel_for(range, [=](sycl::id<1> node_idx) { + // set mean + imp_list_ptr[node_idx * impl_const_t::node_imp_prop_count_ + 0] = + local_sum_hist_ptr[node_idx] / + node_list_ptr[node_idx * impl_const_t::node_prop_count_ + impl_const_t::ind_grc]; + // set sum2cent + imp_list_ptr[node_idx * impl_const_t::node_imp_prop_count_ + 1] = + local_sum2cent_hist_ptr[node_idx] - + (local_sum_hist_ptr[node_idx] * local_sum_hist_ptr[node_idx]) / + node_list_ptr[node_idx * impl_const_t::node_prop_count_ + + impl_const_t::ind_grc]; + }); + }); + + return last_event; +} + template sycl::event train_kernel_hist_impl::compute_initial_sum2cent_local( const train_context_t& ctx, @@ -1135,8 +1280,8 @@ sycl::event train_kernel_hist_impl::compute_initial_his sycl::event last_event; - if (ctx.distr_mode_) { - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { + if (ctx.distr_mode_) { last_event = compute_initial_histogram_local(ctx, response, tree_order, @@ -1156,49 +1301,44 @@ sycl::event train_kernel_hist_impl::compute_initial_his { last_event }); } else { - auto sum_list = pr::ndarray::empty(queue_, { node_count }); - auto sum2cent_list = pr::ndarray::empty(queue_, { node_count }); - last_event = compute_initial_sum_local(ctx, - response, - tree_order, - node_list, - sum_list, - node_count, - deps); - { - ONEDAL_PROFILER_TASK(sum_list, queue_); - comm_.allreduce(sum_list.flatten(queue_, { last_event })).wait(); - } - last_event = compute_initial_sum2cent_local(ctx, - response, - tree_order, - node_list, - sum_list, - sum2cent_list, - node_count, - { last_event }); - { - ONEDAL_PROFILER_TASK(allreduce_sum2cent_list, queue_); - comm_.allreduce(sum2cent_list.flatten(queue_, { last_event })).wait(); - } - last_event = fin_initial_imp(ctx, - node_list, - sum_list, - sum2cent_list, - imp_data_list, - node_count, - { last_event }); + last_event = compute_initial_histogram_local(ctx, + response, + tree_order, + node_list, + imp_data_list, + node_count, + deps); last_event.wait_and_throw(); } } else { - last_event = compute_initial_histogram_local(ctx, - response, - tree_order, - node_list, - imp_data_list, - node_count, - deps); + auto local_sum_hist = pr::ndarray::empty(queue_, { node_count }); + auto local_sum2cent_hist = pr::ndarray::empty(queue_, { node_count }); + + last_event 
= compute_local_sum_histogram(ctx, + response, + tree_order, + node_list, + local_sum_hist, + local_sum2cent_hist, + node_count, + deps); + { + ONEDAL_PROFILER_TASK(allreduce_sum_hist, queue_); + comm_.allreduce(local_sum_hist.flatten(queue_, { last_event })).wait(); + } + { + ONEDAL_PROFILER_TASK(allreduce_sum2cent_hist, queue_); + comm_.allreduce(local_sum2cent_hist.flatten(queue_, { last_event })).wait(); + } + + last_event = compute_initial_imp_for_node_list_regression(ctx, + node_list, + local_sum_hist, + local_sum2cent_hist, + imp_data_list, + node_count, + { last_event }); last_event.wait_and_throw(); } @@ -1409,8 +1549,6 @@ sycl::event train_kernel_hist_impl::do_node_split( const Index* node_list_ptr = node_list.get_data(); const Index* node_vs_tree_map_list_ptr = node_vs_tree_map_list.get_data(); - const bool distr_mode = ctx.distr_mode_; - Index* node_list_new_ptr = node_list_new.get_mutable_data(); Index* node_vs_tree_map_list_new_ptr = node_vs_tree_map_list_new.get_mutable_data(); @@ -1449,7 +1587,7 @@ sycl::event train_kernel_hist_impl::do_node_split( Index* node_rch = node_list_new_ptr + (new_left_node_pos + 1) * node_prop_count; node_lch[impl_const_t::ind_ofs] = node_prn[impl_const_t::ind_ofs]; - node_lch[impl_const_t::ind_lrc] = distr_mode + node_lch[impl_const_t::ind_lrc] = ctx.distr_mode_ ? node_prn[impl_const_t::ind_lch_lrc] : node_prn[impl_const_t::ind_lch_grc]; node_lch[impl_const_t::ind_grc] = node_prn[impl_const_t::ind_lch_grc]; @@ -1613,7 +1751,7 @@ sycl::event train_kernel_hist_impl::compute_results( pr::ndarray& oob_per_obs_list, pr::ndarray& var_imp, pr::ndarray& var_imp_variance, - const rng_engine_list_t& engine_arr, + rng_engine_list_t& engine_arr, Index tree_idx_in_block, Index tree_in_block_count, Index built_tree_count, @@ -1660,12 +1798,12 @@ sycl::event train_kernel_hist_impl::compute_results( const Float div1 = Float(1) / Float(built_tree_count + tree_idx_in_block + 1); - pr::rng rn_gen; - for (Index column_idx = 0; column_idx < ctx.column_count_; ++column_idx) { - rn_gen.shuffle(oob_row_count, - permutation_ptr, - engine_arr[built_tree_count + tree_idx_in_block].get_state()); + pr::shuffle(queue_, + oob_row_count, + permutation_ptr, + engine_arr[built_tree_count + tree_idx_in_block], + {}); const Float oob_err_perm = compute_oob_error_perm(ctx, model_manager, data_host, @@ -1858,10 +1996,12 @@ train_result train_kernel_hist_impl::operator()( de::check_mul_overflow((ctx.tree_count_ - 1), skip_num); - pr::engine_collection collection(ctx.tree_count_, desc.get_seed()); - rng_engine_list_t engine_arr = collection([&](std::size_t i, std::size_t& skip) { - skip = i * skip_num; - }); + pr::engine_collection_oneapi collection( + queue_, + ctx.tree_count_, + desc.get_seed()); + + rng_engine_list_t engine_arr = collection.get_engines(); pr::ndarray node_imp_decrease_list; @@ -1882,49 +2022,55 @@ train_result train_kernel_hist_impl::operator()( imp_data_mng_t imp_data_holder(queue_, ctx); // initilizing imp_list and class_hist_list (for classification) imp_data_holder.init_new_level(node_count); - de::check_mul_overflow(node_count, impl_const_t::node_prop_count_); de::check_mul_overflow(node_count, impl_const_t::node_imp_prop_count_); - auto node_vs_tree_map_list_host = pr::ndarray::empty({ node_count }); - auto level_node_list_init_host = - pr::ndarray::empty({ node_count * impl_const_t::node_prop_count_ }); - - auto tree_map = node_vs_tree_map_list_host.get_mutable_data(); - auto node_list_ptr = level_node_list_init_host.get_mutable_data(); - - for (Index node = 
0; node < node_count; ++node) { - Index* node_ptr = node_list_ptr + node * impl_const_t::node_prop_count_; - tree_map[node] = iter + node; - node_ptr[impl_const_t::ind_ofs] = - ctx.selected_row_total_count_ * node; // local row offset - node_ptr[impl_const_t::ind_lrc] = - ctx.distr_mode_ - ? 0 - : ctx.selected_row_count_; // for distr_mode it will be updated during gen_initial_tree_order - node_ptr[impl_const_t::ind_grc] = - ctx.selected_row_total_count_; // global selected rows - it is already filtered for current block - node_ptr[impl_const_t::ind_lch_lrc] = - 0; // for distr_mode it will be updated during tree_order_gen - node_ptr[impl_const_t::ind_fid] = impl_const_t::bad_val_; - } + auto node_vs_tree_map_list = + pr::ndarray::empty(queue_, { node_count }, alloc::device); + auto level_node_list_init = + pr::ndarray::empty(queue_, + { node_count * impl_const_t::node_prop_count_ }, + alloc::device); + + auto tree_map = node_vs_tree_map_list.get_mutable_data(); + auto node_list_ptr = level_node_list_init.get_mutable_data(); + + auto fill_event = queue_.submit([&](sycl::handler& cgh) { + cgh.depends_on({ last_event }); + cgh.parallel_for(sycl::range<1>{ std::size_t(node_count) }, [=](sycl::id<1> node) { + Index* node_ptr = node_list_ptr + node * impl_const_t::node_prop_count_; + tree_map[node] = iter + node; + node_ptr[impl_const_t::ind_ofs] = + ctx.selected_row_total_count_ * node; // local row offset + node_ptr[impl_const_t::ind_lrc] = + ctx.distr_mode_ + ? 0 + : ctx.selected_row_count_; // for distr_mode it will be updated during gen_initial_tree_order + node_ptr[impl_const_t::ind_grc] = + ctx.selected_row_total_count_; // global selected rows - it is already filtered for current block + node_ptr[impl_const_t::ind_lch_lrc] = + 0; // for distr_mode it will be updated during tree_order_gen + node_ptr[impl_const_t::ind_fid] = impl_const_t::bad_val_; + }); + }); - last_event = gen_initial_tree_order(ctx, - engine_arr, - level_node_list_init_host, - tree_order_lev_, - iter, - node_count); - - auto node_vs_tree_map_list = node_vs_tree_map_list_host.to_device(queue_); - level_node_lists.push_back(level_node_list_init_host.to_device(queue_)); - - last_event = compute_initial_histogram(ctx, - response_nd_, - tree_order_lev_, - level_node_lists[0], - imp_data_holder.get_mutable_data(0), - node_count, - { last_event }); + auto gen_initial_tree_order_event = gen_initial_tree_order(ctx, + engine_arr, + level_node_list_init, + tree_order_lev_, + iter, + node_count, + { fill_event }); + + level_node_lists.push_back(level_node_list_init); + + auto compute_initial_histogram_event = + compute_initial_histogram(ctx, + response_nd_, + tree_order_lev_, + level_node_lists[0], + imp_data_holder.get_mutable_data(0), + node_count, + { gen_initial_tree_order_event }); last_event.wait_and_throw(); if (ctx.oob_required_) { @@ -2000,7 +2146,6 @@ train_result train_kernel_hist_impl::operator()( if (node_count_new) { //there are split nodes -> next level is required node_count_new *= 2; - de::check_mul_overflow(node_count_new, impl_const_t::node_prop_count_); auto node_list_new = pr::ndarray::empty( queue_, diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_misc_structs.hpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_misc_structs.hpp index 96e4552ea1b..360c1ba1065 100644 --- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_misc_structs.hpp +++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_misc_structs.hpp @@ -119,7 +119,7 @@ struct train_context { Index selected_ftr_count_ 
= 0; Index selected_row_count_ = 0; - Index selected_row_total_count_ = 0; + std::int64_t selected_row_total_count_ = 0; Index min_observations_in_leaf_node_ = 0; Index max_tree_depth_ = 0; diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_service_kernels_dpc.cpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_service_kernels_dpc.cpp index 11549f3d62d..0de8dd1d6f6 100644 --- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_service_kernels_dpc.cpp +++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_service_kernels_dpc.cpp @@ -19,6 +19,7 @@ #include "oneapi/dal/table/row_accessor.hpp" #include "oneapi/dal/detail/profiler.hpp" #include "oneapi/dal/algo/decision_forest/backend/gpu/train_helpers.hpp" +#include #ifdef ONEDAL_DATA_PARALLEL diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_splitter_impl_dpc.cpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_splitter_impl_dpc.cpp index eeaafe2a179..d7ad6137288 100644 --- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_splitter_impl_dpc.cpp +++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_splitter_impl_dpc.cpp @@ -19,7 +19,7 @@ #include "oneapi/dal/table/row_accessor.hpp" #include "oneapi/dal/detail/profiler.hpp" #include "oneapi/dal/algo/decision_forest/backend/gpu/train_helpers.hpp" - +#include #ifdef ONEDAL_DATA_PARALLEL #include "oneapi/dal/algo/decision_forest/backend/gpu/train_splitter_impl.hpp" @@ -134,7 +134,6 @@ sycl::event train_splitter_impl::random_split( const auto nd_range = bk::make_multiple_nd_range_2d({ local_size, node_in_block_count }, { local_size, 1 }); - sycl::event last_event = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(deps); local_accessor_rw_t local_hist_buf(hist_size, cgh); @@ -537,7 +536,19 @@ sycl::event train_splitter_impl::best_split( const Index bin_block = compute_bin_block_size(queue, hist_prop_count, bin_count); - const Index local_size = bk::device_max_wg_size(queue); + const Index local_size_initial = bk::device_max_wg_size(queue); + Index local_size = local_size_initial; + const auto max_int_limit = std::numeric_limits::max(); + + if (node_count * ftr_count > 0 && node_count * ftr_count <= max_int_limit) { + while (node_count * ftr_count * local_size > max_int_limit) { + local_size /= 2; + } + } + else { + std::cerr << "Error: node_count * ftr_count exceeds int limit" << std::endl; + } + const auto nd_range = bk::make_multiple_nd_range_3d({ node_count, ftr_count, local_size }, { 1, 1, local_size }); @@ -686,7 +697,6 @@ sycl::event train_splitter_impl::best_split( } }); }); - // Merging kernel: selects best split among all features. 
const auto merge_range = bk::make_multiple_nd_range_2d({ node_count, local_size }, { 1, local_size }); diff --git a/cpp/oneapi/dal/algo/decision_forest/test/spmd.cpp b/cpp/oneapi/dal/algo/decision_forest/test/spmd.cpp index 534acddb04a..fe99f8f8d4b 100644 --- a/cpp/oneapi/dal/algo/decision_forest/test/spmd.cpp +++ b/cpp/oneapi/dal/algo/decision_forest/test/spmd.cpp @@ -209,7 +209,7 @@ DF_SPMD_CLS_TEST_NIGHTLY_EXT("df cls default flow") { desc.set_max_tree_depth(max_tree_depth_val); desc.set_class_count(wl.ds_info.class_count); - this->set_rank_count(2); + this->set_rank_count(4); const auto train_result = this->train_spmd_base_checks(desc, data, this->get_homogen_table_id()); const auto model = train_result.get_model(); @@ -233,7 +233,7 @@ DF_SPMD_CLS_TEST_EXT("df cls corner flow") { desc.set_min_observations_in_leaf_node(8); desc.set_class_count(wl.ds_info.class_count); - this->set_rank_count(2); + this->set_rank_count(4); const auto train_result = this->train_spmd_base_checks(desc, data, this->get_homogen_table_id()); const auto model = train_result.get_model(); @@ -257,7 +257,7 @@ DF_SPMD_CLS_TEST_EXT("df cls small flow") { desc.set_tree_count(tree_count); desc.set_class_count(wl.ds_info.class_count); - this->set_rank_count(2); + this->set_rank_count(4); const auto train_result = this->train_spmd_base_checks(desc, data, this->get_homogen_table_id()); const auto model = train_result.get_model(); @@ -288,7 +288,7 @@ DF_SPMD_CLS_TEST_NIGHTLY_EXT("df cls impurity flow") { desc.set_impurity_threshold(impurity_threshold_val); desc.set_class_count(wl.ds_info.class_count); - this->set_rank_count(2); + this->set_rank_count(4); const auto train_result = this->train_spmd_base_checks(desc, data, this->get_homogen_table_id()); const auto model = train_result.get_model(); @@ -317,7 +317,7 @@ DF_SPMD_CLS_TEST_NIGHTLY_EXT("df cls all features flow") { desc.set_features_per_node(data.get_column_count() - 1); // skip responses column desc.set_class_count(wl.ds_info.class_count); - this->set_rank_count(2); + this->set_rank_count(4); const auto train_result = this->train_spmd_base_checks(desc, data, this->get_homogen_table_id()); const auto model = train_result.get_model(); @@ -342,7 +342,7 @@ DF_SPMD_CLS_TEST_NIGHTLY_EXT("df cls bootstrap flow") { desc.set_max_tree_depth(50); desc.set_class_count(wl.ds_info.class_count); - this->set_rank_count(2); + this->set_rank_count(4); const auto train_result = this->train_spmd_base_checks(desc, data, this->get_homogen_table_id()); const auto model = train_result.get_model(); @@ -372,7 +372,7 @@ DF_SPMD_CLS_TEST_NIGHTLY_EXT("df cls oob per observation flow") { desc.set_observations_per_tree_fraction(observations_per_tree_fraction_val); desc.set_class_count(wl.ds_info.class_count); - this->set_rank_count(2); + this->set_rank_count(4); const auto train_result = this->train_spmd_base_checks(desc, data, this->get_homogen_table_id()); const auto model = train_result.get_model(); @@ -393,7 +393,7 @@ DF_SPMD_CLS_TEST("df cls base check with default params") { desc.set_class_count(class_count); - this->set_rank_count(2); + this->set_rank_count(4); const auto train_result = this->train_spmd_base_checks(desc, data, this->get_homogen_table_id()); const auto model = train_result.get_model(); @@ -411,7 +411,7 @@ DF_SPMD_CLS_TEST("df cls base check with default params and train weights") { desc.set_class_count(class_count); - this->set_rank_count(2); + this->set_rank_count(4); const auto train_result = this->train_spmd_weighted_base_checks(desc, data, 
this->get_homogen_table_id()); const auto model = train_result.get_model(); @@ -444,7 +444,7 @@ DF_SPMD_CLS_TEST("df cls base check with non default params") { desc.set_voting_mode(df::voting_mode::unweighted); desc.set_class_count(class_count); - this->set_rank_count(2); + this->set_rank_count(4); const auto train_result = this->train_spmd_base_checks(desc, data, this->get_homogen_table_id()); const auto model = train_result.get_model(); @@ -462,7 +462,7 @@ DF_SPMD_REG_TEST("df reg base check with default params") { auto desc = this->get_default_descriptor(); - this->set_rank_count(2); + this->set_rank_count(4); const auto train_result = this->train_spmd_base_checks(desc, data, this->get_homogen_table_id()); const auto model = train_result.get_model(); @@ -478,7 +478,7 @@ DF_SPMD_REG_TEST("df reg base check with default params and train weights") { auto desc = this->get_default_descriptor(); - this->set_rank_count(2); + this->set_rank_count(4); const auto train_result = this->train_spmd_weighted_base_checks(desc, data, this->get_homogen_table_id()); const auto model = train_result.get_model(); @@ -505,7 +505,7 @@ DF_SPMD_REG_TEST("df reg base check with non default params") { desc.set_variable_importance_mode(variable_importance_mode_val); desc.set_error_metric_mode(error_metric_mode_val); - this->set_rank_count(2); + this->set_rank_count(4); const auto train_result = this->train_spmd_base_checks(desc, data, this->get_homogen_table_id()); const auto model = train_result.get_model(); @@ -524,7 +524,7 @@ DF_SPMD_REG_TEST_NIGHTLY_EXT("df reg default flow") { auto desc = this->get_default_descriptor(); - this->set_rank_count(2); + this->set_rank_count(4); const auto train_result = this->train_spmd_base_checks(desc, data, this->get_homogen_table_id()); const auto model = train_result.get_model(); @@ -547,7 +547,7 @@ DF_SPMD_REG_TEST_EXT("df reg small flow") { desc.set_tree_count(tree_count); desc.set_min_observations_in_leaf_node(1); - this->set_rank_count(2); + this->set_rank_count(4); const auto train_result = this->train_spmd_base_checks(desc, data, this->get_homogen_table_id()); const auto model = train_result.get_model(); @@ -572,7 +572,7 @@ DF_SPMD_REG_TEST_NIGHTLY_EXT("df reg impurity flow") { desc.set_min_observations_in_leaf_node(min_observations_in_leaf_node); desc.set_impurity_threshold(impurity_threshold_val); - this->set_rank_count(2); + this->set_rank_count(4); const auto train_result = this->train_spmd_base_checks(desc, data, this->get_homogen_table_id()); const auto model = train_result.get_model(); @@ -599,7 +599,7 @@ DF_SPMD_REG_TEST_NIGHTLY_EXT("df reg bootstrap flow") { desc.set_max_tree_depth(max_tree_depth_val); desc.set_bootstrap(bootstrap_val); - this->set_rank_count(2); + this->set_rank_count(4); const auto train_result = this->train_spmd_base_checks(desc, data, this->get_homogen_table_id()); const auto model = train_result.get_model(); diff --git a/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp b/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp index d21de8c9627..ecd49784378 100644 --- a/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp +++ b/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp @@ -17,7 +17,7 @@ #pragma once #include "oneapi/dal/backend/memory.hpp" -#include "oneapi/dal/backend/primitives/rng/rng_engine.hpp" +#include "oneapi/dal/backend/primitives/rng/rng.hpp" namespace oneapi::dal::preview::louvain::backend { using namespace oneapi::dal::preview::detail; @@ -123,8 +123,7 @@ struct louvain_data { // Total link weight in 
the network value_type m; - engine eng; - rng rn_gen; + host_engine eng; const std::int64_t vertex_count; const std::int64_t edge_count; diff --git a/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp b/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp index 79e294e9f47..e287c3f2f66 100644 --- a/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp +++ b/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp @@ -206,7 +206,7 @@ inline Float move_nodes(const dal::preview::detail::topology& t, ld.random_order[index] = index; } // random shuffle - ld.rn_gen.uniform(t._vertex_count, ld.index, ld.eng.get_state(), 0, t._vertex_count); + uniform(t._vertex_count, ld.index, ld.eng, 0, t._vertex_count); for (std::int64_t index = 0; index < t._vertex_count; ++index) { std::swap(ld.random_order[index], ld.random_order[ld.index[index]]); } diff --git a/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp b/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp index d22a2dde0a1..6cf2b73ccd6 100644 --- a/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp +++ b/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp @@ -25,7 +25,7 @@ #include "oneapi/dal/table/csr_accessor.hpp" #include "oneapi/dal/detail/debug.hpp" -#include "oneapi/dal/backend/primitives/rng/rng_engine.hpp" +#include "oneapi/dal/backend/primitives/rng/rng.hpp" namespace oneapi::dal::backend::primitives::test { @@ -572,13 +572,12 @@ class logloss_test : public te::float_algo_fixture rn_gen; auto vec_host = ndarray::empty(this->get_queue(), { dim }, sycl::usm::alloc::host); for (std::int32_t ij = 0; ij < num_checks; ++ij) { - primitives::engine eng(2007 + dim * num_checks + ij); - rn_gen.uniform(dim, vec_host.get_mutable_data(), eng.get_state(), -1.0, 1.0); + primitives::host_engine eng(2007 + dim * num_checks + ij); + pr::uniform(dim, vec_host.get_mutable_data(), eng, -1.0, 1.0); auto vec_gpu = vec_host.to_device(this->get_queue()); auto out_vector = ndarray::empty(this->get_queue(), { dim }, sycl::usm::alloc::device); diff --git a/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp b/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp index e902dd452e1..63ab0a07c13 100644 --- a/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp +++ b/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp @@ -100,12 +100,12 @@ class logloss_spmd_test : public logloss_test { std::int64_t num_checks = 5; std::vector> vecs_host(num_checks), vecs_gpu(num_checks); - rng rn_gen; + for (std::int64_t ij = 0; ij < num_checks; ++ij) { - engine eng(2007 + dim * num_checks + ij); + host_engine eng(2007 + dim * num_checks + ij); vecs_host[ij] = (ndarray::empty(this->get_queue(), { dim }, sycl::usm::alloc::host)); - rn_gen.uniform(dim, vecs_host[ij].get_mutable_data(), eng.get_state(), -1.0, 1.0); + uniform(dim, vecs_host[ij].get_mutable_data(), eng, -1.0, 1.0); vecs_gpu[ij] = vecs_host[ij].to_device(this->get_queue()); } diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp index ea320f690a2..b529836f70e 100644 --- a/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp @@ -20,7 +20,7 @@ #include 
"oneapi/dal/test/engine/common.hpp" #include "oneapi/dal/test/engine/fixtures.hpp" #include "oneapi/dal/table/row_accessor.hpp" -#include "oneapi/dal/backend/primitives/rng/rng_engine.hpp" +#include "oneapi/dal/backend/primitives/rng/rng.hpp" #include namespace oneapi::dal::backend::primitives::test { @@ -43,9 +43,8 @@ class cg_solver_test : public te::float_algo_fixture { x_host_ = ndarray::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host); b_host_ = ndarray::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host); - primitives::rng rn_gen; - primitives::engine eng(4014 + n_); - rn_gen.uniform(n_, x_host_.get_mutable_data(), eng.get_state(), -1.0, 1.0); + primitives::host_engine eng(4014 + n_); + primitives::uniform(n_, x_host_.get_mutable_data(), eng, -1.0, 1.0); create_stable_matrix(this->get_queue(), A_host_); diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp index a6b87b2dcc1..c188c50983c 100644 --- a/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp +++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp @@ -21,7 +21,7 @@ #include "oneapi/dal/backend/primitives/ndarray.hpp" #include "oneapi/dal/test/engine/common.hpp" #include "oneapi/dal/test/engine/fixtures.hpp" -#include "oneapi/dal/backend/primitives/rng/rng_engine.hpp" +#include "oneapi/dal/backend/primitives/rng/rng.hpp" #include "oneapi/dal/backend/primitives/blas/gemv.hpp" #include "oneapi/dal/backend/primitives/element_wise.hpp" @@ -133,11 +133,10 @@ void create_stable_matrix(sycl::queue& queue, ONEDAL_ASSERT(A.get_dimension(1) == n); auto J = ndarray::empty(queue, { n, n }, sycl::usm::alloc::host); auto eigen_values = ndarray::empty(queue, { n }, sycl::usm::alloc::host); - primitives::rng rn_gen; - primitives::engine eng(2007 + n); + primitives::host_engine eng(2007 + n); - rn_gen.uniform(n * n, J.get_mutable_data(), eng.get_state(), -1.0, 1.0); - rn_gen.uniform(n, eigen_values.get_mutable_data(), eng.get_state(), bottom_eig, top_eig); + primitives::uniform(n * n, J.get_mutable_data(), eng, -1.0, 1.0); + primitives::uniform(n, eigen_values.get_mutable_data(), eng, bottom_eig, top_eig); // orthogonalize matrix J gram_schmidt(J); diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp index f473dddf1f7..b2ebe9f5bdb 100644 --- a/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp @@ -22,7 +22,7 @@ #include "oneapi/dal/test/engine/common.hpp" #include "oneapi/dal/test/engine/fixtures.hpp" #include "oneapi/dal/table/row_accessor.hpp" -#include "oneapi/dal/backend/primitives/rng/rng_engine.hpp" +#include "oneapi/dal/backend/primitives/rng/rng.hpp" #include #include "oneapi/dal/backend/primitives/objective_function.hpp" @@ -56,10 +56,10 @@ class newton_cg_test : public te::float_algo_fixture { ndarray::empty(this->get_queue(), { n_ + 1 }, sycl::usm::alloc::host); auto params_host = ndarray::empty(this->get_queue(), { p_ + 1 }, sycl::usm::alloc::host); - primitives::rng rn_gen; - primitives::engine eng(2007 + n); - rn_gen.uniform(n_ * p_, X_host.get_mutable_data(), eng.get_state(), -10.0, 10.0); - rn_gen.uniform(p_ + 1, params_host.get_mutable_data(), eng.get_state(), -5.0, 5.0); + + primitives::host_engine eng(2007 + n); + primitives::uniform(n_ * p_, X_host.get_mutable_data(), eng, -10.0, 10.0); + primitives::uniform(p_ + 1, 
params_host.get_mutable_data(), eng, -5.0, 5.0); for (std::int64_t i = 0; i < n_; ++i) { float_t val = 0; for (std::int64_t j = 0; j < p_; ++j) { @@ -142,9 +142,9 @@ class newton_cg_test : public te::float_algo_fixture { ndarray::empty(this->get_queue(), { n_, n_ }, sycl::usm::alloc::host); solution_ = ndarray::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host); auto b_host = ndarray::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host); - primitives::rng rn_gen; + primitives::engine eng(4014 + n_); - rn_gen.uniform(n_, solution_.get_mutable_data(), eng.get_state(), -1.0, 1.0); + uniform(n_, solution_.get_mutable_data(), eng, -1.0, 1.0); create_stable_matrix(this->get_queue(), A_host, float_t(0.1), float_t(5.0)); @@ -164,7 +164,7 @@ class newton_cg_test : public te::float_algo_fixture { auto buffer = ndarray::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host); for (std::int32_t test_num = 0; test_num < 5; ++test_num) { - rn_gen.uniform(n_, x_host.get_mutable_data(), eng.get_state(), -1.0, 1.0); + uniform(n_, x_host.get_mutable_data(), eng, -1.0, 1.0); auto x_gpu = x_host.to_device(this->get_queue()); auto compute_event_vec = func_->update_x(x_gpu, true, {}); wait_or_pass(compute_event_vec).wait_and_throw(); diff --git a/cpp/oneapi/dal/backend/primitives/rng/dpc_engine.hpp b/cpp/oneapi/dal/backend/primitives/rng/dpc_engine.hpp new file mode 100644 index 00000000000..9b9745f4cfa --- /dev/null +++ b/cpp/oneapi/dal/backend/primitives/rng/dpc_engine.hpp @@ -0,0 +1,153 @@ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#pragma once + +#include "oneapi/dal/backend/primitives/rng/utils.hpp" +#include "oneapi/dal/backend/primitives/rng/rng_types.hpp" +#include + +namespace mkl = oneapi::mkl; +namespace oneapi::dal::backend::primitives { + +#ifdef ONEDAL_DATA_PARALLEL + +template +struct dpc_engine_type; + +template <> +struct dpc_engine_type { + using type = oneapi::mkl::rng::mt2203; +}; + +template <> +struct dpc_engine_type { + using type = oneapi::mkl::rng::mcg59; +}; + +template <> +struct dpc_engine_type { + using type = oneapi::mkl::rng::mt19937; +}; + +template <> +struct dpc_engine_type { + using type = oneapi::mkl::rng::mrg32k3a; +}; + +template <> +struct dpc_engine_type { + using type = oneapi::mkl::rng::philox4x32x10; +}; + +/// A class that provides a unified interface for random number generation on both CPU and GPU devices. +/// +/// This class serves as a wrapper for random number generators (RNGs) that supports different engine types, +/// enabling efficient random number generation on heterogeneous platforms using SYCL. It integrates a host +/// (CPU) engine and a device (GPU) engine, allowing operations to be executed seamlessly on the appropriate +/// device. +/// +/// @tparam EngineType The RNG engine type to be used. Defaults to `engine_method::mt2203`. 
+/// +/// @param[in] queue The SYCL queue used to manage device operations. +/// @param[in] seed The initial seed for the random number generator. Defaults to `777`. +/// +/// The class provides functionality to skip ahead in the RNG sequence, retrieve engine states, and +/// manage host and device engines independently. Support for `skip_ahead` on GPU is currently limited for +/// some engine types. +template +class dpc_engine { +public: + using dpc_engine_t = typename dpc_engine_type::type; + + explicit dpc_engine(sycl::queue& queue, std::int64_t seed = 777) + : q(queue), + host_engine_(initialize_host_engine(seed)), + dpc_engine_(initialize_dpc_engine(queue, seed)), + impl_(dynamic_cast( + host_engine_.get())) { + if (!impl_) { + throw std::domain_error("RNG engine is not supported"); + } + } + + virtual ~dpc_engine() = default; + + void* get_host_engine_state() const { + return impl_->getState(); + } + + auto& get_cpu_engine() { + return host_engine_; + } + + auto& get_gpu_engine() { + return dpc_engine_; + } + + void skip_ahead_cpu(size_t nSkip) { + host_engine_->skipAhead(nSkip); + } + + void skip_ahead_gpu(size_t nSkip) { + // Will be supported in the next oneMKL release. + if constexpr (EngineType == engine_method::mt2203) { + } + else { + skip_ahead(dpc_engine_, nSkip); + } + } + + sycl::queue& get_queue() { + return q; + } + +private: + daal::algorithms::engines::EnginePtr initialize_host_engine(std::int64_t seed) { + switch (EngineType) { + case engine_method::mt2203: + return daal::algorithms::engines::mt2203::Batch<>::create(seed); + case engine_method::mcg59: + return daal::algorithms::engines::mcg59::Batch<>::create(seed); + case engine_method::mrg32k3a: + return daal::algorithms::engines::mrg32k3a::Batch<>::create(seed); + case engine_method::philox4x32x10: + return daal::algorithms::engines::philox4x32x10::Batch<>::create(seed); + case engine_method::mt19937: + return daal::algorithms::engines::mt19937::Batch<>::create(seed); + default: throw std::invalid_argument("Unsupported engine type"); + } + } + + dpc_engine_t initialize_dpc_engine(sycl::queue& queue, std::int64_t seed) { + if constexpr (EngineType == engine_method::mt2203) { + return dpc_engine_t( + queue, + seed, + 0); // Aligns CPU and GPU results for mt2203, impacts the performance. + } + else { + return dpc_engine_t(queue, seed); + } + } + sycl::queue q; + daal::algorithms::engines::EnginePtr host_engine_; + dpc_engine_t dpc_engine_; + daal::algorithms::engines::internal::BatchBaseImpl* impl_; +}; + +#endif +} // namespace oneapi::dal::backend::primitives diff --git a/cpp/oneapi/dal/backend/primitives/rng/host_engine.hpp b/cpp/oneapi/dal/backend/primitives/rng/host_engine.hpp new file mode 100644 index 00000000000..c4b2c807674 --- /dev/null +++ b/cpp/oneapi/dal/backend/primitives/rng/host_engine.hpp @@ -0,0 +1,100 @@ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#pragma once + +#include "oneapi/dal/backend/primitives/rng/rng_types.hpp" +#include "oneapi/dal/backend/primitives/rng/rng.hpp" +#include "oneapi/dal/backend/primitives/rng/utils.hpp" + +#include +#include +#include + +namespace oneapi::dal::backend::primitives { + +/// A class that provides an interface for random number generation on the host (CPU) only. +/// +/// This class serves as a wrapper for host-based random number generators (RNGs), supporting multiple engine +/// types for flexible and efficient random number generation on CPU. It abstracts the underlying engine +/// implementation and provides an interface to manage and retrieve the engine's state. +/// +/// @tparam EngineType The RNG engine type to be used. Defaults to `engine_method::mt2203`. +/// +/// @param[in] seed The initial seed for the random number generator. Defaults to `777`. +/// +/// @note The class only supports host-based RNG and does not require a SYCL queue or device context. +template +class host_engine { +public: + explicit host_engine(std::int64_t seed = 777) + : host_engine_(initialize_host_engine(seed)), + impl_(dynamic_cast( + host_engine_.get())) { + if (!impl_) { + throw std::domain_error("RNG engine is not supported"); + } + } + + explicit host_engine(const daal::algorithms::engines::EnginePtr& eng) : host_engine_(eng) { + impl_ = dynamic_cast(eng.get()); + if (!impl_) { + throw domain_error(dal::detail::error_messages::rng_engine_is_not_supported()); + } + } + + host_engine& operator=(const daal::algorithms::engines::EnginePtr& eng) { + host_engine_ = eng; + impl_ = dynamic_cast(eng.get()); + if (!impl_) { + throw domain_error(dal::detail::error_messages::rng_engine_is_not_supported()); + } + + return *this; + } + + virtual ~host_engine() = default; + + void* get_host_engine_state() const { + return impl_->getState(); + } + + auto& get_host_engine() { + return host_engine_; + } + +private: + daal::algorithms::engines::EnginePtr initialize_host_engine(std::int64_t seed) { + switch (EngineType) { + case engine_method::mt2203: + return daal::algorithms::engines::mt2203::Batch<>::create(seed); + case engine_method::mcg59: + return daal::algorithms::engines::mcg59::Batch<>::create(seed); + case engine_method::mrg32k3a: + return daal::algorithms::engines::mrg32k3a::Batch<>::create(seed); + case engine_method::philox4x32x10: + return daal::algorithms::engines::philox4x32x10::Batch<>::create(seed); + case engine_method::mt19937: + return daal::algorithms::engines::mt19937::Batch<>::create(seed); + default: throw std::invalid_argument("Unsupported engine type"); + } + } + + daal::algorithms::engines::EnginePtr host_engine_; + daal::algorithms::engines::internal::BatchBaseImpl* impl_; +}; + +} // namespace oneapi::dal::backend::primitives diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng.hpp new file mode 100644 index 00000000000..83125ba73e7 --- /dev/null +++ b/cpp/oneapi/dal/backend/primitives/rng/rng.hpp @@ -0,0 +1,134 @@ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#pragma once + +#include "oneapi/dal/backend/primitives/rng/host_engine.hpp" + +#ifdef ONEDAL_DATA_PARALLEL + +#include "oneapi/dal/backend/primitives/rng/dpc_engine.hpp" + +#endif + +namespace oneapi::dal::backend::primitives { + +template +void uniform(Size count, Type* dst, host_engine& host_engine, Type a, Type b) { + auto state = host_engine.get_host_engine_state(); + uniform_dispatcher::uniform_by_cpu(count, dst, state, a, b); +} + +template +void uniform_without_replacement(Size count, + Type* dst, + Type* buffer, + host_engine host_engine, + Type a, + Type b) { + auto state = host_engine.get_host_engine_state(); + uniform_dispatcher::uniform_without_replacement_by_cpu(count, dst, buffer, state, a, b); +} + +template >> +void shuffle(Size count, Type* dst, host_engine host_engine) { + auto state = host_engine.get_host_engine_state(); + Type idx[2]; + for (Size i = 0; i < count; ++i) { + uniform_dispatcher::uniform_by_cpu(2, idx, state, 0, count); + std::swap(dst[idx[0]], dst[idx[1]]); + } +} + +#ifdef ONEDAL_DATA_PARALLEL +template +void uniform(Size count, Type* dst, dpc_engine& engine_, Type a, Type b) { + if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) == + sycl::usm::alloc::device) { + throw domain_error(dal::detail::error_messages::unsupported_data_type()); + } + auto state = engine_.get_host_engine_state(); + uniform_dispatcher::uniform_by_cpu(count, dst, state, a, b); + engine_.skip_ahead_gpu(count); +} + +template +void uniform_without_replacement(Size count, + Type* dst, + Type* buffer, + dpc_engine& engine_, + Type a, + Type b) { + if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) == + sycl::usm::alloc::device) { + throw domain_error(dal::detail::error_messages::unsupported_data_type()); + } + void* state = engine_.get_host_engine_state(); + uniform_dispatcher::uniform_without_replacement_by_cpu(count, dst, buffer, state, a, b); + engine_.skip_ahead_gpu(count); +} + +template >> +void shuffle(Size count, Type* dst, dpc_engine& engine_) { + if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) == + sycl::usm::alloc::device) { + throw domain_error(dal::detail::error_messages::unsupported_data_type()); + } + Type idx[2]; + void* state = engine_.get_host_engine_state(); + for (Size i = 0; i < count; ++i) { + uniform_dispatcher::uniform_by_cpu(2, idx, state, 0, count); + std::swap(dst[idx[0]], dst[idx[1]]); + } + engine_.skip_ahead_gpu(count); +} + +template +void uniform(sycl::queue& queue, + Size count, + Type* dst, + dpc_engine& engine_, + Type a, + Type b, + const event_vector& deps = {}); + +template +void uniform_without_replacement(sycl::queue& queue, + Size count, + Type* dst, + Type* buffer, + dpc_engine& engine_, + Type a, + Type b, + const event_vector& deps = {}); + +template +void shuffle(sycl::queue& queue, + Size count, + Type* dst, + dpc_engine& engine_, + const event_vector& deps = {}); +#endif + +}; // namespace oneapi::dal::backend::primitives diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp 
b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp new file mode 100644 index 00000000000..4ad09c4cc99 --- /dev/null +++ b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp @@ -0,0 +1,186 @@ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include +#include "oneapi/dal/backend/primitives/rng/rng.hpp" +#include "oneapi/dal/backend/primitives/ndarray.hpp" + +namespace oneapi::dal::backend::primitives { + +namespace bk = oneapi::dal::backend; + +template +void uniform(sycl::queue& queue, + Size count, + Type* dst, + dpc_engine& engine_, + Type a, + Type b, + const event_vector& deps) { + if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) == sycl::usm::alloc::host) { + throw domain_error(dal::detail::error_messages::unsupported_data_type()); + } + oneapi::mkl::rng::uniform distr(a, b); + auto event = oneapi::mkl::rng::generate(distr, engine_.get_gpu_engine(), count, dst, { deps }); + event.wait_and_throw(); + engine_.skip_ahead_cpu(count); +} + +//Currently only CPU impl +template +void uniform_without_replacement(sycl::queue& queue, + Size count, + Type* dst, + Type* buffer, + dpc_engine& engine_, + Type a, + Type b, + const event_vector& deps) { + if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) == + sycl::usm::alloc::device) { + throw domain_error(dal::detail::error_messages::unsupported_data_type()); + } + void* state = engine_.get_host_engine_state(); + engine_.skip_ahead_gpu(count); + uniform_dispatcher::uniform_without_replacement_by_cpu(count, dst, buffer, state, a, b); +} + +//Currently only CPU impl +template +void shuffle(sycl::queue& queue, + Size count, + Type* dst, + dpc_engine& engine_, + const event_vector& deps) { + Type idx[2]; + if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) == + sycl::usm::alloc::device) { + throw domain_error(dal::detail::error_messages::unsupported_data_type()); + } + void* state = engine_.get_host_engine_state(); + engine_.skip_ahead_gpu(count); + + for (Size i = 0; i < count; ++i) { + uniform_dispatcher::uniform_by_cpu(2, idx, state, 0, count); + std::swap(dst[idx[0]], dst[idx[1]]); + } +} + +#define INSTANTIATE_(F, Size, EngineType) \ + template ONEDAL_EXPORT void uniform(sycl::queue& queue, \ + Size count_, \ + F* dst, \ + dpc_engine& engine_, \ + F a, \ + F b, \ + const event_vector& deps); + +#define INSTANTIATE_FLOAT_(Size) \ + INSTANTIATE_(float, Size, engine_method::mt2203) \ + INSTANTIATE_(float, Size, engine_method::mcg59) \ + INSTANTIATE_(float, Size, engine_method::mrg32k3a) \ + INSTANTIATE_(float, Size, engine_method::philox4x32x10) \ + INSTANTIATE_(float, Size, engine_method::mt19937) \ + INSTANTIATE_(double, Size, engine_method::mt2203) \ + INSTANTIATE_(double, Size, engine_method::mcg59) \ + INSTANTIATE_(double, Size, engine_method::mrg32k3a) \ + INSTANTIATE_(double, Size, 
engine_method::philox4x32x10) \ + INSTANTIATE_(double, Size, engine_method::mt19937) \ + INSTANTIATE_(std::int32_t, Size, engine_method::mt2203) \ + INSTANTIATE_(std::int32_t, Size, engine_method::mcg59) \ + INSTANTIATE_(std::int32_t, Size, engine_method::mrg32k3a) \ + INSTANTIATE_(std::int32_t, Size, engine_method::philox4x32x10) \ + INSTANTIATE_(std::int32_t, Size, engine_method::mt19937) +INSTANTIATE_FLOAT_(std::int64_t); +INSTANTIATE_FLOAT_(std::int32_t); + +#define INSTANTIATE_uniform_without_replacement(F, Size, EngineType) \ + template ONEDAL_EXPORT void uniform_without_replacement(sycl::queue& queue, \ + Size count_, \ + F* dst, \ + F* buff, \ + dpc_engine& engine_, \ + F a, \ + F b, \ + const event_vector& deps); + +#define INSTANTIATE_uniform_without_replacement_FLOAT(Size) \ + INSTANTIATE_uniform_without_replacement(float, Size, engine_method::mt2203) \ + INSTANTIATE_uniform_without_replacement( \ + float, \ + Size, \ + engine_method::mcg59) INSTANTIATE_uniform_without_replacement(float, \ + Size, \ + engine_method::mrg32k3a) \ + INSTANTIATE_uniform_without_replacement(float, Size, engine_method::philox4x32x10) \ + INSTANTIATE_uniform_without_replacement(float, Size, engine_method::mt19937) \ + INSTANTIATE_uniform_without_replacement(double, Size, engine_method::mt2203) \ + INSTANTIATE_uniform_without_replacement(double, \ + Size, \ + engine_method::mcg59) \ + INSTANTIATE_uniform_without_replacement(double, \ + Size, \ + engine_method::mrg32k3a) \ + INSTANTIATE_uniform_without_replacement( \ + double, \ + Size, \ + engine_method::philox4x32x10) \ + INSTANTIATE_uniform_without_replacement( \ + double, \ + Size, \ + engine_method::mt19937) \ + INSTANTIATE_uniform_without_replacement( \ + std::int32_t, \ + Size, \ + engine_method::mt2203) \ + INSTANTIATE_uniform_without_replacement( \ + std::int32_t, \ + Size, \ + engine_method::mcg59) \ + INSTANTIATE_uniform_without_replacement( \ + std::int32_t, \ + Size, \ + engine_method::mrg32k3a) \ + INSTANTIATE_uniform_without_replacement( \ + std::int32_t, \ + Size, \ + engine_method::philox4x32x10) \ + INSTANTIATE_uniform_without_replacement( \ + std::int32_t, \ + Size, \ + engine_method::mt19937) + +INSTANTIATE_uniform_without_replacement_FLOAT(std::int64_t); +INSTANTIATE_uniform_without_replacement_FLOAT(std::int32_t); + +#define INSTANTIATE_SHUFFLE(F, Size, EngineType) \ + template ONEDAL_EXPORT void shuffle(sycl::queue& queue, \ + Size count_, \ + F* dst, \ + dpc_engine& engine_, \ + const event_vector& deps); + +#define INSTANTIATE_SHUFFLE_FLOAT(Size) \ + INSTANTIATE_SHUFFLE(std::int32_t, Size, engine_method::mt2203) \ + INSTANTIATE_SHUFFLE(std::int32_t, Size, engine_method::mcg59) \ + INSTANTIATE_SHUFFLE(std::int32_t, Size, engine_method::mrg32k3a) \ + INSTANTIATE_SHUFFLE(std::int32_t, Size, engine_method::philox4x32x10) \ + INSTANTIATE_SHUFFLE(std::int32_t, Size, engine_method::mt19937) + +INSTANTIATE_SHUFFLE_FLOAT(std::int64_t); +INSTANTIATE_SHUFFLE_FLOAT(std::int32_t); + +} // namespace oneapi::dal::backend::primitives diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_engine.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_engine.hpp deleted file mode 100644 index c8ca3b13ce9..00000000000 --- a/cpp/oneapi/dal/backend/primitives/rng/rng_engine.hpp +++ /dev/null @@ -1,101 +0,0 @@ -/******************************************************************************* -* Copyright 2021 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the 
License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#pragma once - -#include - -#include "oneapi/dal/backend/primitives/rng/utils.hpp" - -namespace oneapi::dal::backend::primitives { - -template -class rng { -public: - rng() = default; - ~rng() = default; - - void uniform(Size count, Type* dst, void* state, Type a, Type b) { - uniform_dispatcher::uniform_by_cpu(count, dst, state, a, b); - } - - void uniform_without_replacement(Size count, - Type* dst, - Type* buffer, - void* state, - Type a, - Type b) { - uniform_dispatcher::uniform_without_replacement_by_cpu(count, - dst, - buffer, - state, - a, - b); - } - - template >> - void shuffle(Size count, Type* dst, void* state) { - Type idx[2]; - - for (Size i = 0; i < count; ++i) { - uniform_dispatcher::uniform_by_cpu(2, idx, state, 0, count); - std::swap(dst[idx[0]], dst[idx[1]]); - } - } - -private: - daal::internal::RNGsInst daal_rng_; -}; - -class engine { -public: - explicit engine(std::int64_t seed = 777) - : engine_(daal::algorithms::engines::mt2203::Batch<>::create(seed)) { - impl_ = dynamic_cast(engine_.get()); - if (!impl_) { - throw domain_error(dal::detail::error_messages::rng_engine_is_not_supported()); - } - } - - explicit engine(const daal::algorithms::engines::EnginePtr& eng) : engine_(eng) { - impl_ = dynamic_cast(eng.get()); - if (!impl_) { - throw domain_error(dal::detail::error_messages::rng_engine_is_not_supported()); - } - } - - virtual ~engine() = default; - - engine& operator=(const daal::algorithms::engines::EnginePtr& eng) { - engine_ = eng; - impl_ = dynamic_cast(eng.get()); - if (!impl_) { - throw domain_error(dal::detail::error_messages::rng_engine_is_not_supported()); - } - - return *this; - } - - void* get_state() const { - return impl_->getState(); - } - -private: - daal::algorithms::engines::EnginePtr engine_; - daal::algorithms::engines::internal::BatchBaseImpl* impl_; -}; - -} // namespace oneapi::dal::backend::primitives diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp index 09a5a589141..e7e19f64c4d 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp +++ b/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp @@ -16,12 +16,18 @@ #pragma once -#include "oneapi/dal/backend/primitives/rng/rng_engine.hpp" - +#include "oneapi/dal/backend/primitives/rng/rng.hpp" +#include "oneapi/dal/backend/primitives/ndarray.hpp" #include +#include "oneapi/dal/backend/primitives/rng/utils.hpp" +#include "oneapi/dal/backend/primitives/rng/rng_types.hpp" +#include "oneapi/dal/table/common.hpp" + namespace oneapi::dal::backend::primitives { +#ifdef ONEDAL_DATA_PARALLEL + template class engine_collection { public: @@ -30,10 +36,10 @@ class engine_collection { engine_(daal::algorithms::engines::mt2203::Batch<>::create(seed)), params_(count), technique_(daal::algorithms::engines::internal::family), - daal_engine_list_(count) {} + host_engine_method_(count) {} template - std::vector operator()(Op&& op) { + std::vector> operator()(Op&& op) { 
daal::services::Status status; for (Size i = 0; i < count_; ++i) { op(i, params_.nSkip[i]); @@ -43,25 +49,25 @@ class engine_collection { engine_, technique_, params_, - daal_engine_list_, + host_engine_method_, &status); if (!status) { dal::backend::interop::status_to_exception(status); } - std::vector engine_list(count_); + std::vector> engine_method(count_); for (Size i = 0; i < count_; ++i) { - engine_list[i] = daal_engine_list_[i]; + engine_method[i] = host_engine_method_[i]; } //copy elision - return engine_list; + return engine_method; } private: void select_parallelization_technique( daal::algorithms::engines::internal::ParallelizationTechnique& technique) { - auto daal_engine_impl = + auto host_engine_impl = dynamic_cast(engine_.get()); daal::algorithms::engines::internal::ParallelizationTechnique techniques[] = { @@ -71,7 +77,7 @@ class engine_collection { }; for (auto& techn : techniques) { - if (daal_engine_impl->hasSupport(techn)) { + if (host_engine_impl->hasSupport(techn)) { technique = techn; return; } @@ -87,7 +93,30 @@ class engine_collection { daal::algorithms::engines::internal::Params params_; daal::algorithms::engines::internal::ParallelizationTechnique technique_; daal::services::internal::TArray - daal_engine_list_; + host_engine_method_; +}; + +template +class engine_collection_oneapi { +public: + engine_collection_oneapi(sycl::queue& queue, Size count, std::int64_t seed = 777) + : count_(count), + seed_(seed) { + engines_.reserve(count_); + for (Size i = 0; i < count_; ++i) { + engines_.push_back(dpc_engine(queue, seed_)); + } + } + + std::vector> get_engines() const { + return engines_; + } + +private: + Size count_; + std::int64_t seed_; + std::vector> engines_; }; +#endif } // namespace oneapi::dal::backend::primitives diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_types.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_types.hpp new file mode 100644 index 00000000000..4132fbe557a --- /dev/null +++ b/cpp/oneapi/dal/backend/primitives/rng/rng_types.hpp @@ -0,0 +1,29 @@ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#pragma once + +#include +#include +#include +#include +#include + +namespace oneapi::dal::backend::primitives { + +enum class engine_method { mt2203, mcg59, mt19937, mrg32k3a, philox4x32x10 }; + +} diff --git a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp new file mode 100644 index 00000000000..2a079f15466 --- /dev/null +++ b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp @@ -0,0 +1,237 @@ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "oneapi/dal/test/engine/common.hpp" +#include "oneapi/dal/test/engine/fixtures.hpp" +#include "oneapi/dal/test/engine/dataframe.hpp" + +#include "oneapi/dal/backend/primitives/rng/rng.hpp" +#include "oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp" +namespace oneapi::dal::backend::primitives::test { + +namespace te = dal::test::engine; + +class mt2203 {}; +class mcg59 {}; +class mrg32k3a {}; +class mt19937 {}; +class philox4x32x10 {}; + +template +struct engine_map {}; + +template <> +struct engine_map { + constexpr static auto value = engine_method::mt2203; +}; + +template <> +struct engine_map { + constexpr static auto value = engine_method::mcg59; +}; + +template <> +struct engine_map { + constexpr static auto value = engine_method::mrg32k3a; +}; + +template <> +struct engine_map { + constexpr static auto value = engine_method::philox4x32x10; +}; + +template <> +struct engine_map { + constexpr static auto value = engine_method::mt19937; +}; + +template +constexpr auto engine_v = engine_map::value; + +template +class rng_test : public te::policy_fixture { +public: + using DataType = std::tuple_element_t<0, TestType>; + using EngineType = std::tuple_element_t<1, TestType>; + static constexpr auto engine_test_type = engine_v; + + auto get_host_engine(std::int64_t seed) { + auto rng_engine = host_engine(seed); + return rng_engine; + } + + auto get_dpc_engine(std::int64_t seed) { + auto rng_engine = dpc_engine(this->get_queue(), seed); + return rng_engine; + } + + auto allocate_array_host(std::int64_t elem_count) { + auto arr_host = ndarray::empty({ elem_count }); + return arr_host; + } + + auto allocate_array_device(std::int64_t elem_count) { + auto& q = this->get_queue(); + auto arr_gpu = ndarray::empty(q, { elem_count }, sycl::usm::alloc::device); + return arr_gpu; + } + + void check_results(const ndarray& arr_1, const ndarray& arr_2) { + const auto arr_1_host = arr_1.to_host(this->get_queue()); + const DataType* val_arr_1_host_ptr = arr_1_host.get_data(); + + const auto arr_2_host = arr_2.to_host(this->get_queue()); + const DataType* val_arr_2_host_ptr = arr_2_host.get_data(); + + for (std::int64_t el = 0; el < arr_2_host.get_count(); el++) { + // Due to MKL inside generates floats on GPU and doubles on CPU, it makes sense to add minor eps. 
+ REQUIRE(abs(val_arr_1_host_ptr[el] - val_arr_2_host_ptr[el]) < 0.1); + } + } +}; + +using rng_types = COMBINE_TYPES((float, double), (mt2203, mt19937, mcg59, mrg32k3a, philox4x32x10)); + +TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types) { + SKIP_IF(this->get_policy().is_cpu()); + using Float = std::tuple_element_t<0, TestType>; + + std::int64_t elem_count = GENERATE_COPY(10, 777, 10000, 50000); + std::int64_t seed = GENERATE_COPY(777, 999); + + auto arr_gpu = this->allocate_array_device(elem_count); + auto arr_host = this->allocate_array_host(elem_count); + auto arr_gpu_ptr = arr_gpu.get_mutable_data(); + auto arr_host_ptr = arr_host.get_mutable_data(); + + auto rng_engine = this->get_dpc_engine(seed); + auto rng_engine_ = this->get_dpc_engine(seed); + + uniform(elem_count, arr_host_ptr, rng_engine, 0, elem_count); + uniform(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine_, 0, elem_count); + + this->check_results(arr_gpu, arr_host); +} + +using rng_types_skip_ahead_support = COMBINE_TYPES((float, double), + (mt19937, mcg59, mrg32k3a, philox4x32x10)); + +TEMPLATE_LIST_TEST_M(rng_test, "mixed rng cpu skip", "[rng]", rng_types_skip_ahead_support) { + SKIP_IF(this->get_policy().is_cpu()); + using Float = std::tuple_element_t<0, TestType>; + + std::int64_t elem_count = GENERATE_COPY(10, 777, 10000, 100000); + std::int64_t seed = GENERATE_COPY(777, 999); + + auto arr_host_init_1 = this->allocate_array_host(elem_count); + auto arr_host_init_2 = this->allocate_array_host(elem_count); + + auto arr_gpu = this->allocate_array_device(elem_count); + auto arr_host = this->allocate_array_host(elem_count); + + auto arr_host_init_1_ptr = arr_host_init_1.get_mutable_data(); + auto arr_host_init_2_ptr = arr_host_init_2.get_mutable_data(); + auto arr_gpu_ptr = arr_gpu.get_mutable_data(); + auto arr_host_ptr = arr_host.get_mutable_data(); + + auto rng_engine = this->get_dpc_engine(seed); + auto rng_engine_2 = this->get_dpc_engine(seed); + + uniform(elem_count, arr_host_init_1_ptr, rng_engine, 0, elem_count); + uniform(elem_count, arr_host_init_2_ptr, rng_engine_2, 0, elem_count); + + uniform(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine, 0, elem_count); + uniform(elem_count, arr_host_ptr, rng_engine_2, 0, elem_count); + + this->check_results(arr_host_init_1, arr_host_init_2); + this->check_results(arr_gpu, arr_host); +} + +TEMPLATE_LIST_TEST_M(rng_test, "mixed rng gpu skip", "[rng]", rng_types_skip_ahead_support) { + SKIP_IF(this->get_policy().is_cpu()); + using Float = std::tuple_element_t<0, TestType>; + + std::int64_t elem_count = GENERATE_COPY(10, 100, 777, 10000); + std::int64_t seed = GENERATE_COPY(1, 777, 999); + + auto arr_device_init_1 = this->allocate_array_device(elem_count); + auto arr_device_init_2 = this->allocate_array_device(elem_count); + + auto arr_gpu = this->allocate_array_device(elem_count); + auto arr_host = this->allocate_array_host(elem_count); + + auto arr_device_init_1_ptr = arr_device_init_1.get_mutable_data(); + auto arr_device_init_2_ptr = arr_device_init_2.get_mutable_data(); + auto arr_gpu_ptr = arr_gpu.get_mutable_data(); + auto arr_host_ptr = arr_host.get_mutable_data(); + + auto rng_engine = this->get_dpc_engine(seed); + auto rng_engine_2 = this->get_dpc_engine(seed); + + uniform(this->get_queue(), elem_count, arr_device_init_1_ptr, rng_engine, 0, elem_count); + uniform(this->get_queue(), + elem_count, + arr_device_init_2_ptr, + rng_engine_2, + 0, + elem_count); + + uniform(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine, 0, 
elem_count); + uniform(elem_count, arr_host_ptr, rng_engine_2, 0, elem_count); + + this->check_results(arr_device_init_1, arr_device_init_2); + this->check_results(arr_gpu, arr_host); +} + +//TODO: add engine collection test + host_engine tests +// TEMPLATE_LIST_TEST_M(rng_test, "mixed rng gpu skip collection", "[rng]", rng_types_skip) { +// SKIP_IF(this->get_policy().is_cpu()); +// std::int64_t elem_count = GENERATE_COPY(10, 100, 777, 10000); +// std::int64_t seed = GENERATE_COPY(1, 777, 999); + +// engine_collection collection(this->get_queue(), 2, seed); + +// auto engine_arr = collection.get_dpc_engines(); + +// auto [arr_device_init_1, arr_device_init_2] = this->allocate_arrays_shared(elem_count); + +// auto arr_device_init_1_ptr = arr_device_init_1.get_mutable_data(); +// auto arr_device_init_2_ptr = arr_device_init_2.get_mutable_data(); + +// auto rn_gen = this->get_rng(); + +// rn_gen.uniform(this->get_queue(), +// elem_count, +// arr_device_init_1_ptr, +// engine_arr[0], +// 0, +// elem_count); + +// rn_gen.uniform(this->get_queue(), +// elem_count, +// arr_device_init_2_ptr, +// engine_arr[1], +// 0, +// elem_count); + +// // rn_gen.uniform(this->get_queue(), elem_count, arr_gpu_ptr, engine_arr[0], 0, elem_count); +// // rn_gen.uniform(elem_count, arr_host_ptr, engine_arr[1], 0, elem_count); + +// //this->check_results_device(arr_device_init_1, arr_device_init_2); +// this->check_results(arr_device_init_1, arr_device_init_2); +// } + +} // namespace oneapi::dal::backend::primitives::test diff --git a/cpp/oneapi/dal/detail/profiler.cpp b/cpp/oneapi/dal/detail/profiler.cpp index 4fbb21aec49..48215741415 100644 --- a/cpp/oneapi/dal/detail/profiler.cpp +++ b/cpp/oneapi/dal/detail/profiler.cpp @@ -15,27 +15,101 @@ *******************************************************************************/ #include "oneapi/dal/detail/profiler.hpp" +#include namespace oneapi::dal::detail { + +profiler::profiler() { + start_time = get_time(); +} + +profiler::~profiler() { + auto end_time = get_time(); + auto total_time = end_time - start_time; + std::cerr << "KERNEL_PROFILER: total time " << total_time / 1e6 << std::endl; +} + +std::uint64_t profiler::get_time() { + struct timespec t; + clock_gettime(CLOCK_MONOTONIC, &t); + return t.tv_sec * 1000000000 + t.tv_nsec; +} + +profiler* profiler::get_instance() { + static profiler instance; + return &instance; +} + +task& profiler::get_task() { + return task_; +} + +#ifdef ONEDAL_DATA_PARALLEL +sycl::queue& profiler::get_queue() { + return queue_; +} + +void profiler::set_queue(const sycl::queue& q) { + queue_ = q; +} +#endif + profiler_task profiler::start_task(const char* task_name) { + auto ns_start = get_time(); + auto& tasks_info = get_instance()->get_task(); + tasks_info.time_kernels[tasks_info.current_kernel] = ns_start; + tasks_info.current_kernel++; return profiler_task(task_name); } -void profiler::end_task(const char* task_name) {} +void profiler::end_task(const char* task_name) { + const std::uint64_t ns_end = get_time(); + auto& tasks_info = get_instance()->get_task(); +#ifdef ONEDAL_DATA_PARALLEL + auto& queue = get_instance()->get_queue(); + queue.wait_and_throw(); +#endif + tasks_info.current_kernel--; + const std::uint64_t times = ns_end - tasks_info.time_kernels[tasks_info.current_kernel]; -profiler_task::profiler_task(const char* task_name) : task_name_(task_name) {} + auto it = tasks_info.kernels.find(task_name); + if (it == tasks_info.kernels.end()) { + tasks_info.kernels.insert({ task_name, times }); + } + else { + it->second += 
times; + } + std::cerr << "KERNEL_PROFILER: " << std::string(task_name) << " " << times / 1e6 << std::endl; +} #ifdef ONEDAL_DATA_PARALLEL -profiler_task profiler::start_task(const char* task_name, const sycl::queue& task_queue) { +profiler_task profiler::start_task(const char* task_name, sycl::queue& task_queue) { + task_queue.wait_and_throw(); + get_instance()->set_queue(task_queue); + auto ns_start = get_time(); + auto& tasks_info = get_instance()->get_task(); + tasks_info.time_kernels[tasks_info.current_kernel] = ns_start; + tasks_info.current_kernel++; return profiler_task(task_name, task_queue); } + + profiler_task::profiler_task(const char* task_name, const sycl::queue& task_queue) : task_name_(task_name), - task_queue_(task_queue) {} + task_queue_(task_queue), + has_queue_(true) {} + #endif +profiler_task::profiler_task(const char* task_name) + : task_name_(task_name) {} + profiler_task::~profiler_task() { + #ifdef ONEDAL_DATA_PARALLEL + if (has_queue_) + task_queue_.wait_and_throw(); + #endif // ONEDAL_DATA_PARALLEL profiler::end_task(task_name_); } diff --git a/cpp/oneapi/dal/detail/profiler.hpp b/cpp/oneapi/dal/detail/profiler.hpp index 749d8f618a0..3eacba0ee63 100644 --- a/cpp/oneapi/dal/detail/profiler.hpp +++ b/cpp/oneapi/dal/detail/profiler.hpp @@ -19,6 +19,14 @@ #ifdef ONEDAL_DATA_PARALLEL #include #endif + + +#include +#include +#include +#include +#include +#include #define ONEDAL_PROFILER_CONCAT2(x, y) x##y #define ONEDAL_PROFILER_CONCAT(x, y) ONEDAL_PROFILER_CONCAT2(x, y) @@ -39,6 +47,16 @@ namespace oneapi::dal::detail { + + +struct task { + static const std::uint64_t MAX_KERNELS = 256; + std::map kernels; + std::uint64_t current_kernel = 0; + std::uint64_t time_kernels[MAX_KERNELS]; + void clear(); +}; + class profiler_task { public: profiler_task(const char* task_name); @@ -47,24 +65,38 @@ class profiler_task { #endif ~profiler_task(); - profiler_task(profiler_task& other) = delete; - - profiler_task& operator=(profiler_task& other) = delete; - private: const char* task_name_; #ifdef ONEDAL_DATA_PARALLEL sycl::queue task_queue_; + bool has_queue_; #endif }; class profiler { public: + profiler(); + ~profiler(); static profiler_task start_task(const char* task_name); + static std::uint64_t get_time(); + static profiler* get_instance(); + task& get_task(); + #ifdef ONEDAL_DATA_PARALLEL - static profiler_task start_task(const char* task_name, const sycl::queue& task_queue); + sycl::queue& get_queue(); + void set_queue(const sycl::queue& q); + + + static profiler_task start_task(const char* task_name, sycl::queue& task_queue); #endif static void end_task(const char* task_name); + +private: + std::uint64_t start_time; + task task_; +#ifdef ONEDAL_DATA_PARALLEL + sycl::queue queue_; +#endif }; } // namespace oneapi::dal::detail diff --git a/docs/source/daal/algorithms/engines/index.rst b/docs/source/daal/algorithms/engines/index.rst index 9def0af4c81..133513fd9f2 100644 --- a/docs/source/daal/algorithms/engines/index.rst +++ b/docs/source/daal/algorithms/engines/index.rst @@ -111,4 +111,6 @@ These methods are represented with member functions of classes that represent fu mt19937.rst mcg59.rst + mrg32k3a.rst + philox4x32x10.rst mt2203.rst diff --git a/docs/source/daal/algorithms/engines/mrg32k3a.rst b/docs/source/daal/algorithms/engines/mrg32k3a.rst new file mode 100644 index 00000000000..e931c801890 --- /dev/null +++ b/docs/source/daal/algorithms/engines/mrg32k3a.rst @@ -0,0 +1,62 @@ +.. Copyright contributors to the oneDAL project +.. +.. 
Licensed under the Apache License, Version 2.0 (the "License");
+.. you may not use this file except in compliance with the License.
+.. You may obtain a copy of the License at
+..
+.. http://www.apache.org/licenses/LICENSE-2.0
+..
+.. Unless required by applicable law or agreed to in writing, software
+.. distributed under the License is distributed on an "AS IS" BASIS,
+.. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. See the License for the specific language governing permissions and
+.. limitations under the License.
+
+mrg32k3a
+========
+
+The engine is based on a 32-bit combined multiple recursive generator
+with two components of order 3, optimized for batch processing.
+
+.. rubric:: Subsequence selection methods support
+
+skipAhead (nskip)
+   Supported
+leapfrog (threadIdx, nThreads)
+   Supported
+
+Batch Processing
+****************
+
+The mrg32k3a engine needs the initial condition (``seed``) for state initialization.
+The seed can be either an integer scalar or a vector of :math:`p` integer elements, the inputs to the respective engine constructors.
+
+.. rubric:: Algorithm Parameters
+
+The mrg32k3a engine has the following parameters:
+
+.. tabularcolumns:: |\Y{0.2}|\Y{0.2}|\Y{0.6}|
+
+.. list-table:: Algorithm Parameters for mrg32k3a engine (Batch Processing)
+   :header-rows: 1
+   :widths: 10 20 30
+   :align: left
+   :class: longtable
+
+   * - Parameter
+     - Default Value
+     - Description
+   * - ``algorithmFPType``
+     - ``float``
+     - The floating-point type that the algorithm uses for intermediate computations. Can be ``float`` or ``double``.
+   * - ``method``
+     - ``defaultDense``
+     - Performance-oriented computation method; the only method supported by the algorithm.
+   * - ``seed``
+     -
+       - :math:`777` for a scalar seed
+       - NA for a vector seed
+     - Initial condition for state initialization, scalar or vector:
+
+       - Scalar, value of ``size_t`` type
+       - Vector, pointer to ``HomogenNumericTable`` of size :math:`1 \times p`
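A minimal usage sketch of the new engine through the DAAL C++ API, for readers who want to see where the ``seed`` parameter described above enters the picture. The uniform distribution class, its ``tableToFill`` input id, and the ``HomogenNumericTable`` factory follow the existing DAAL distribution examples and are shown only for illustration; they are assumptions, not part of this change.

    #include "daal.h"

    using namespace daal::algorithms;
    using namespace daal::data_management;

    int main() {
        // Create the mrg32k3a engine with a scalar seed (777 is the documented default).
        engines::EnginePtr engine = engines::mrg32k3a::Batch<>::create(777);

        // Assumed usage, mirroring the existing uniform-distribution example:
        // fill a 1000 x 1 table with values uniformly distributed on [0, 1).
        NumericTablePtr table =
            HomogenNumericTable<float>::create(1, 1000, NumericTable::doAllocate);

        distributions::uniform::Batch<float> uniform(0.0f, 1.0f);
        uniform.parameter.engine = engine;
        uniform.input.set(distributions::tableToFill, table);
        uniform.compute();

        return 0;
    }

The same pattern applies to the philox4x32x10 engine documented next; only the namespace in the ``create`` call changes.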
diff --git a/docs/source/daal/algorithms/engines/philox4x32x10.rst b/docs/source/daal/algorithms/engines/philox4x32x10.rst
new file mode 100644
index 00000000000..ac50ea80fdb
--- /dev/null
+++ b/docs/source/daal/algorithms/engines/philox4x32x10.rst
@@ -0,0 +1,62 @@
+.. Copyright contributors to the oneDAL project
+..
+.. Licensed under the Apache License, Version 2.0 (the "License");
+.. you may not use this file except in compliance with the License.
+.. You may obtain a copy of the License at
+..
+.. http://www.apache.org/licenses/LICENSE-2.0
+..
+.. Unless required by applicable law or agreed to in writing, software
+.. distributed under the License is distributed on an "AS IS" BASIS,
+.. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. See the License for the specific language governing permissions and
+.. limitations under the License.
+
+philox4x32x10
+=============
+
+Implementation of the Philox4x32-10 engine: a counter-based pseudorandom number generator (PRNG)
+that uses a 4x32-bit counter and performs 10 rounds of mixing to produce high-quality randomness.
+
+.. rubric:: Subsequence selection methods support
+
+skipAhead (nskip)
+   Supported
+leapfrog (threadIdx, nThreads)
+   Supported
+
+Batch Processing
+****************
+
+The philox4x32x10 engine needs the initial condition (``seed``) for state initialization.
+The seed can be either an integer scalar or a vector of :math:`p` integer elements, the inputs to the respective engine constructors.
+
+.. rubric:: Algorithm Parameters
+
+The philox4x32x10 engine has the following parameters:
+
+.. tabularcolumns:: |\Y{0.2}|\Y{0.2}|\Y{0.6}|
+
+.. list-table:: Algorithm Parameters for philox4x32x10 engine (Batch Processing)
+   :header-rows: 1
+   :widths: 10 20 30
+   :align: left
+   :class: longtable
+
+   * - Parameter
+     - Default Value
+     - Description
+   * - ``algorithmFPType``
+     - ``float``
+     - The floating-point type that the algorithm uses for intermediate computations. Can be ``float`` or ``double``.
+   * - ``method``
+     - ``defaultDense``
+     - Performance-oriented computation method; the only method supported by the algorithm.
+   * - ``seed``
+     -
+       - :math:`777` for a scalar seed
+       - NA for a vector seed
+     - Initial condition for state initialization, scalar or vector:
+
+       - Scalar, value of ``size_t`` type
+       - Vector, pointer to ``HomogenNumericTable`` of size :math:`1 \times p`
diff --git a/makefile.lst b/makefile.lst
index 92dc52ff521..b042ede80a7 100755
--- a/makefile.lst
+++ b/makefile.lst
@@ -65,7 +65,7 @@ multiclassclassifier += classifier k_nearest_neighbors += engines classifier logistic_regression += classifier optimization_solver objective_function engines implicit_als += engines distributions
-engines += engines/mt19937 engines/mcg59 engines/mt2203
+engines += engines/mt19937 engines/mcg59 engines/mrg32k3a engines/philox4x32x10 engines/mt2203
 distributions += distributions/bernoulli distributions/normal distributions/uniform tsne +=
@@ -95,6 +95,8 @@ CORE.ALGORITHMS.FULL := \ elastic_net \ engines \ engines/mcg59 \
+ engines/mrg32k3a \
+ engines/philox4x32x10 \
 engines/mt19937 \ engines/mt2203 \ em \
@@ -309,6 +311,8 @@ JJ.ALGORITHMS := adaboost elastic_net/prediction \ engines \ engines/mcg59 \
+ engines/mrg32k3a \
+ engines/philox4x32x10 \
 engines/mt19937 \ engines/mt2203 \ em_gmm \
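To tie the pieces of this change together, here is a rough usage sketch of the new ``host_engine``/``dpc_engine`` wrappers and the free ``uniform`` overloads added in rng.hpp and rng_dpc.cpp above. The template parameters are inferred from the surrounding code, so treat the exact signatures as assumptions rather than the final API.

    #include "oneapi/dal/backend/primitives/rng/rng.hpp"

    namespace pr = oneapi::dal::backend::primitives;

    // Host-only path: wraps daal::algorithms::engines::mrg32k3a::Batch<> internally
    // and fills a host buffer with uniformly distributed values.
    void fill_host(float* dst, std::int64_t count, std::int64_t seed) {
        pr::host_engine<pr::engine_method::mrg32k3a> eng(seed);
        pr::uniform(count, dst, eng, 0.0f, 1.0f);
    }

    #ifdef ONEDAL_DATA_PARALLEL
    // Device path: dpc_engine keeps the DAAL host engine and the oneMKL device engine
    // in sync (the host engine is skipped ahead after each device-side generation),
    // so host- and device-side fills can be interleaved on a single engine.
    void fill_device(sycl::queue& q, float* dst_device, std::int64_t count, std::int64_t seed) {
        pr::dpc_engine<pr::engine_method::mrg32k3a> eng(q, seed);
        pr::uniform(q, count, dst_device, eng, 0.0f, 1.0f);
    }
    #endif

Note the USM requirements that follow from the checks in the code above: the host overloads reject device-allocated buffers, while the queue overload rejects host-allocated ones, so the destination pointer must match the overload being called.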