diff --git a/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h b/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h new file mode 100644 index 00000000000..a70c1853e1a --- /dev/null +++ b/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a.h @@ -0,0 +1,183 @@ +/* file: mrg32k3a.h */ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/* +//++ +// Implementation of the MRG32k3a engine: a 32-bit combined multiple recursive generator +// with two components of order 3, optimized for batch processing. +//-- +*/ + +#ifndef __MRG32K3A_H__ +#define __MRG32K3A_H__ + +#include "algorithms/engines/mrg32k3a/mrg32k3a_types.h" +#include "algorithms/engines/engine.h" + +namespace daal +{ +namespace algorithms +{ +namespace engines +{ +namespace mrg32k3a +{ +/** + * @defgroup engines_mrg32k3a_batch Batch + * @ingroup engines_mrg32k3a + * @{ + */ +namespace interface1 +{ +/** + * + * \brief Provides methods to run implementations of the mrg32k3a engine. + * This class is associated with the \ref mrg32k3a::interface1::Batch "mrg32k3a::Batch" class + * and supports the method of mrg32k3a engine computation in the batch processing mode + * + * \tparam algorithmFPType Data type to use in intermediate computations of mrg32k3a engine, double or float + * \tparam method Computation method of the engine, mrg32k3a::Method + * \tparam cpu Version of the cpu-specific implementation of the engine, daal::CpuType + */ +template +class BatchContainer : public daal::algorithms::AnalysisContainerIface +{ +public: + /** + * Constructs a container for the mrg32k3a engine with a specified environment + * in the batch processing mode + * \param[in] daalEnv Environment object + */ + BatchContainer(daal::services::Environment::env * daalEnv); + ~BatchContainer(); + /** + * Computes the result of the mrg32k3a engine in the batch processing mode + * + * \return Status of computations + */ + services::Status compute() DAAL_C11_OVERRIDE; +}; + +/** + * + * \brief Provides methods for mrg32k3a engine computations in the batch processing mode + * + * \tparam algorithmFPType Data type to use in intermediate computations of mrg32k3a engine, double or float + * \tparam method Computation method of the engine, mrg32k3a::Method + * + * \par Enumerations + * - mrg32k3a::Method Computation methods for the mrg32k3a engine + * + * \par References + * - \ref engines::interface1::Input "engines::Input" class + * - \ref engines::interface1::Result "engines::Result" class + */ +template +class DAAL_EXPORT Batch : public engines::BatchBase +{ +public: + typedef engines::BatchBase super; + + typedef typename super::InputType InputType; + typedef typename super::ResultType ResultType; + + /** + * Creates mrg32k3a engine + * \param[in] seed Initial condition for mrg32k3a engine + * + * \return Pointer to mrg32k3a engine + */ + static 
services::SharedPtr > create(size_t seed = 777); + + /** + * Returns method of the engine + * \return Method of the engine + */ + virtual int getMethod() const DAAL_C11_OVERRIDE { return (int)method; } + + /** + * Returns the structure that contains results of mrg32k3a engine + * \return Structure that contains results of mrg32k3a engine + */ + ResultPtr getResult() { return _result; } + + /** + * Registers user-allocated memory to store results of mrg32k3a engine + * \param[in] result Structure to store results of mrg32k3a engine + * + * \return Status of computations + */ + services::Status setResult(const ResultPtr & result) + { + DAAL_CHECK(result, services::ErrorNullResult) + _result = result; + _res = _result.get(); + return services::Status(); + } + + /** + * Returns a pointer to the newly allocated mrg32k3a engine + * with a copy of input objects and parameters of this mrg32k3a engine + * \return Pointer to the newly allocated engine + */ + services::SharedPtr > clone() const { return services::SharedPtr >(cloneImpl()); } + + /** + * Allocates memory to store the result of the mrg32k3a engine + * + * \return Status of computations + */ + virtual services::Status allocateResult() DAAL_C11_OVERRIDE + { + services::Status s = this->_result->template allocate(&(this->input), NULL, (int)method); + this->_res = this->_result.get(); + return s; + } + +protected: + Batch(size_t seed = 777) { initialize(); } + + Batch(const Batch & other) : super(other) { initialize(); } + + virtual Batch * cloneImpl() const DAAL_C11_OVERRIDE { return new Batch(*this); } + + void initialize() + { + Analysis::_ac = new __DAAL_ALGORITHM_CONTAINER(batch, BatchContainer, algorithmFPType, method)(&_env); + _in = &input; + _result.reset(new ResultType()); + } + +private: + ResultPtr _result; + + Batch & operator=(const Batch &); +}; +typedef services::SharedPtr > mrg32k3aPtr; +typedef services::SharedPtr > mrg32k3aConstPtr; + +} // namespace interface1 +using interface1::BatchContainer; +using interface1::Batch; +using interface1::mrg32k3aPtr; +using interface1::mrg32k3aConstPtr; +/** @} */ +} // namespace mrg32k3a +} // namespace engines +} // namespace algorithms +} // namespace daal +#endif diff --git a/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a_types.h b/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a_types.h new file mode 100644 index 00000000000..8fdc58b98c8 --- /dev/null +++ b/cpp/daal/include/algorithms/engines/mrg32k3a/mrg32k3a_types.h @@ -0,0 +1,65 @@ +/* file: mrg32k3a_types.h */ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/* +//++ +// Implementation of the MRG32k3a engine: a 32-bit combined multiple recursive generator +// with two components of order 3, optimized for batch processing. 
+//-- +*/ + +#ifndef __MRG32K3A_TYPES_H__ +#define __MRG32K3A_TYPES_H__ + +#include "algorithms/algorithm.h" +#include "services/daal_defines.h" +#include "data_management/data/numeric_table.h" +#include "data_management/data/homogen_numeric_table.h" + +namespace daal +{ +namespace algorithms +{ +namespace engines +{ +/** + * @defgroup engines_mrg32k3a mrg32k3a Engine + * \copydoc daal::algorithms::engines::mrg32k3a + * @ingroup engines + * @{ + */ +/** + * \brief Contains classes for mrg32k3a engine + */ +namespace mrg32k3a +{ +/** + * + * Available methods to compute mrg32k3a engine + */ +enum Method +{ + defaultDense = 0 /*!< Default: performance-oriented method. */ +}; + +} // namespace mrg32k3a +/** @} */ +} // namespace engines +} // namespace algorithms +} // namespace daal + +#endif diff --git a/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10.h b/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10.h new file mode 100644 index 00000000000..3a5d0e33180 --- /dev/null +++ b/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10.h @@ -0,0 +1,183 @@ +/* file: philox4x32x10.h */ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/* +//++ +// Implementation of the Philox4x32-10 engine: a counter-based pseudorandom number generator (PRNG) +// that uses 4x32-bit keys and performs 10 rounds of mixing to produce high-quality randomness. +//-- +*/ + +#ifndef __PHILOX4X32X10_H__ +#define __PHILOX4X32X10_H__ + +#include "algorithms/engines/philox4x32x10/philox4x32x10_types.h" +#include "algorithms/engines/engine.h" + +namespace daal +{ +namespace algorithms +{ +namespace engines +{ +namespace philox4x32x10 +{ +/** + * @defgroup engines_philox4x32x10_batch Batch + * @ingroup engines_philox4x32x10 + * @{ + */ +namespace interface1 +{ +/** + * + * \brief Provides methods to run implementations of the philox4x32x10 engine. 
+ * This class is associated with the \ref philox4x32x10::interface1::Batch "philox4x32x10::Batch" class + * and supports the method of philox4x32x10 engine computation in the batch processing mode + * + * \tparam algorithmFPType Data type to use in intermediate computations of philox4x32x10 engine, double or float + * \tparam method Computation method of the engine, philox4x32x10::Method + * \tparam cpu Version of the cpu-specific implementation of the engine, daal::CpuType + */ +template +class BatchContainer : public daal::algorithms::AnalysisContainerIface +{ +public: + /** + * Constructs a container for the philox4x32x10 engine with a specified environment + * in the batch processing mode + * \param[in] daalEnv Environment object + */ + BatchContainer(daal::services::Environment::env * daalEnv); + ~BatchContainer(); + /** + * Computes the result of the philox4x32x10 engine in the batch processing mode + * + * \return Status of computations + */ + services::Status compute() DAAL_C11_OVERRIDE; +}; + +/** + * + * \brief Provides methods for philox4x32x10 engine computations in the batch processing mode + * + * \tparam algorithmFPType Data type to use in intermediate computations of philox4x32x10 engine, double or float + * \tparam method Computation method of the engine, philox4x32x10::Method + * + * \par Enumerations + * - philox4x32x10::Method Computation methods for the philox4x32x10 engine + * + * \par References + * - \ref engines::interface1::Input "engines::Input" class + * - \ref engines::interface1::Result "engines::Result" class + */ +template +class DAAL_EXPORT Batch : public engines::BatchBase +{ +public: + typedef engines::BatchBase super; + + typedef typename super::InputType InputType; + typedef typename super::ResultType ResultType; + + /** + * Creates philox4x32x10 engine + * \param[in] seed Initial condition for philox4x32x10 engine + * + * \return Pointer to philox4x32x10 engine + */ + static services::SharedPtr > create(size_t seed = 777); + + /** + * Returns method of the engine + * \return Method of the engine + */ + virtual int getMethod() const DAAL_C11_OVERRIDE { return (int)method; } + + /** + * Returns the structure that contains results of philox4x32x10 engine + * \return Structure that contains results of philox4x32x10 engine + */ + ResultPtr getResult() { return _result; } + + /** + * Registers user-allocated memory to store results of philox4x32x10 engine + * \param[in] result Structure to store results of philox4x32x10 engine + * + * \return Status of computations + */ + services::Status setResult(const ResultPtr & result) + { + DAAL_CHECK(result, services::ErrorNullResult) + _result = result; + _res = _result.get(); + return services::Status(); + } + + /** + * Returns a pointer to the newly allocated philox4x32x10 engine + * with a copy of input objects and parameters of this philox4x32x10 engine + * \return Pointer to the newly allocated engine + */ + services::SharedPtr > clone() const { return services::SharedPtr >(cloneImpl()); } + + /** + * Allocates memory to store the result of the philox4x32x10 engine + * + * \return Status of computations + */ + virtual services::Status allocateResult() DAAL_C11_OVERRIDE + { + services::Status s = this->_result->template allocate(&(this->input), NULL, (int)method); + this->_res = this->_result.get(); + return s; + } + +protected: + Batch(size_t seed = 777) { initialize(); } + + Batch(const Batch & other) : super(other) { initialize(); } + + virtual Batch * cloneImpl() const DAAL_C11_OVERRIDE { return new 
Batch(*this); } + + void initialize() + { + Analysis::_ac = new __DAAL_ALGORITHM_CONTAINER(batch, BatchContainer, algorithmFPType, method)(&_env); + _in = &input; + _result.reset(new ResultType()); + } + +private: + ResultPtr _result; + + Batch & operator=(const Batch &); +}; +typedef services::SharedPtr > philox4x32x10Ptr; +typedef services::SharedPtr > philox4x32x10ConstPtr; + +} // namespace interface1 +using interface1::BatchContainer; +using interface1::Batch; +using interface1::philox4x32x10Ptr; +using interface1::philox4x32x10ConstPtr; +/** @} */ +} // namespace philox4x32x10 +} // namespace engines +} // namespace algorithms +} // namespace daal +#endif diff --git a/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10_types.h b/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10_types.h new file mode 100644 index 00000000000..0c0a92c9b3a --- /dev/null +++ b/cpp/daal/include/algorithms/engines/philox4x32x10/philox4x32x10_types.h @@ -0,0 +1,65 @@ +/* file: philox4x32x10_types.h */ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/* +//++ +// Implementation of the Philox4x32-10 engine: a counter-based pseudorandom number generator (PRNG) +// that uses 4x32-bit keys and performs 10 rounds of mixing to produce high-quality randomness. +//-- +*/ + +#ifndef __PHILOX4X32X10_TYPES_H__ +#define __PHILOX4X32X10_TYPES_H__ + +#include "algorithms/algorithm.h" +#include "services/daal_defines.h" +#include "data_management/data/numeric_table.h" +#include "data_management/data/homogen_numeric_table.h" + +namespace daal +{ +namespace algorithms +{ +namespace engines +{ +/** + * @defgroup engines_philox4x32x10 philox4x32x10 Engine + * \copydoc daal::algorithms::engines::philox4x32x10 + * @ingroup engines + * @{ + */ +/** + * \brief Contains classes for philox4x32x10 engine + */ +namespace philox4x32x10 +{ +/** + * + * Available methods to compute philox4x32x10 engine + */ +enum Method +{ + defaultDense = 0 /*!< Default: performance-oriented method. 
*/ +}; + +} // namespace philox4x32x10 +/** @} */ +} // namespace engines +} // namespace algorithms +} // namespace daal + +#endif diff --git a/cpp/daal/include/daal.h b/cpp/daal/include/daal.h index 881a6c39fbe..f49625f8939 100755 --- a/cpp/daal/include/daal.h +++ b/cpp/daal/include/daal.h @@ -297,13 +297,17 @@ #include "algorithms/distributions/bernoulli/bernoulli.h" #include "algorithms/distributions/bernoulli/bernoulli_types.h" #include "algorithms/engines/engine.h" +#include "algorithms/engines/engine_family.h" +#include "algorithms/engines/mt2203/mt2203.h" +#include "algorithms/engines/mt2203/mt2203_types.h" #include "algorithms/engines/mt19937/mt19937.h" #include "algorithms/engines/mt19937/mt19937_types.h" #include "algorithms/engines/mcg59/mcg59.h" #include "algorithms/engines/mcg59/mcg59_types.h" -#include "algorithms/engines/engine_family.h" -#include "algorithms/engines/mt2203/mt2203.h" -#include "algorithms/engines/mt2203/mt2203_types.h" +#include "algorithms/engines/mrg32k3a/mrg32k3a.h" +#include "algorithms/engines/mrg32k3a/mrg32k3a_types.h" +#include "algorithms/engines/philox4x32x10/philox4x32x10.h" +#include "algorithms/engines/philox4x32x10/philox4x32x10_types.h" #include "algorithms/dbscan/dbscan_types.h" #include "algorithms/dbscan/dbscan_batch.h" #include "algorithms/dbscan/dbscan_distributed.h" diff --git a/cpp/daal/include/daal_win.h b/cpp/daal/include/daal_win.h index e17eff16796..a15ed7db26e 100755 --- a/cpp/daal/include/daal_win.h +++ b/cpp/daal/include/daal_win.h @@ -309,13 +309,17 @@ #include "algorithms/distributions/bernoulli/bernoulli.h" #include "algorithms/distributions/bernoulli/bernoulli_types.h" #include "algorithms/engines/engine.h" +#include "algorithms/engines/engine_family.h" +#include "algorithms/engines/mt2203/mt2203.h" +#include "algorithms/engines/mt2203/mt2203_types.h" #include "algorithms/engines/mt19937/mt19937.h" #include "algorithms/engines/mt19937/mt19937_types.h" #include "algorithms/engines/mcg59/mcg59.h" #include "algorithms/engines/mcg59/mcg59_types.h" -#include "algorithms/engines/engine_family.h" -#include "algorithms/engines/mt2203/mt2203.h" -#include "algorithms/engines/mt2203/mt2203_types.h" +#include "algorithms/engines/mrg32k3a/mrg32k3a.h" +#include "algorithms/engines/mrg32k3a/mrg32k3a_types.h" +#include "algorithms/engines/philox4x32x10/philox4x32x10.h" +#include "algorithms/engines/philox4x32x10/philox4x32x10_types.h" #include "algorithms/dbscan/dbscan_types.h" #include "algorithms/dbscan/dbscan_batch.h" #include "algorithms/dbscan/dbscan_distributed.h" diff --git a/cpp/daal/src/algorithms/engines/mcg59/mcg59_batch_impl.h b/cpp/daal/src/algorithms/engines/mcg59/mcg59_batch_impl.h index 6c3040da615..62f337ba9a0 100644 --- a/cpp/daal/src/algorithms/engines/mcg59/mcg59_batch_impl.h +++ b/cpp/daal/src/algorithms/engines/mcg59/mcg59_batch_impl.h @@ -26,9 +26,6 @@ #include "src/externals/service_rng.h" #include "src/data_management/service_numeric_table.h" -static const int leapfrogMethodErrcode = -1002; -static const int skipAheadMethodErrcode = -1003; - namespace daal { namespace algorithms @@ -67,7 +64,7 @@ class BatchImpl : public algorithms::engines::mcg59::interface1::Batch +SharedPtr > Batch::create(size_t seed) +{ + SharedPtr > engPtr; +#define DAAL_CREATE_ENGINE_CPU(cpuId, ...) 
engPtr.reset(new BatchImpl(__VA_ARGS__)); + + DAAL_DISPATCH_FUNCTION_BY_CPU(DAAL_CREATE_ENGINE_CPU, seed); + +#undef DAAL_CREATE_ENGINE_CPU + return engPtr; +} + +template class Batch; +template class Batch; + +} // namespace interface1 +} // namespace mrg32k3a +} // namespace engines +} // namespace algorithms +} // namespace daal diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_container.h b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_container.h new file mode 100644 index 00000000000..ce83f554026 --- /dev/null +++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_container.h @@ -0,0 +1,68 @@ +/* file: mrg32k3a_batch_container.h */ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/* +//++ +// Implementation of mrg32k3a calculation algorithm container. +//-- +*/ + +#ifndef __mrg32k3a_BATCH_CONTAINER_H__ +#define __mrg32k3a_BATCH_CONTAINER_H__ + +#include "algorithms/engines/mrg32k3a/mrg32k3a.h" +#include "src/algorithms/engines/mrg32k3a/mrg32k3a_kernel.h" + +namespace daal +{ +namespace algorithms +{ +namespace engines +{ +namespace mrg32k3a +{ +namespace interface1 +{ +template +BatchContainer::BatchContainer(daal::services::Environment::env * daalEnv) : AnalysisContainerIface(daalEnv) +{ + __DAAL_INITIALIZE_KERNELS(internal::mrg32k3aKernel, algorithmFPType, method); +} + +template +BatchContainer::~BatchContainer() +{ + __DAAL_DEINITIALIZE_KERNELS(); +} + +template +services::Status BatchContainer::compute() +{ + daal::services::Environment::env & env = *_env; + engines::Result * result = static_cast(_res); + NumericTable * resultTable = result->get(engines::randomNumbers).get(); + + __DAAL_CALL_KERNEL(env, internal::mrg32k3aKernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, resultTable); +} + +} // namespace interface1 +} // namespace mrg32k3a +} // namespace engines +} // namespace algorithms +} // namespace daal + +#endif diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h new file mode 100644 index 00000000000..9c226e54af3 --- /dev/null +++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_batch_impl.h @@ -0,0 +1,114 @@ +/* file: mrg32k3a_batch_impl.h */ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/* +//++ +// Implementation of the class defining the mrg32k3a engine. +//-- +*/ + +#include "algorithms/engines/mrg32k3a/mrg32k3a.h" +#include "src/algorithms/engines/engine_batch_impl.h" +#include "src/externals/service_rng.h" +#include "src/data_management/service_numeric_table.h" + +namespace daal +{ +namespace algorithms +{ +namespace engines +{ +namespace mrg32k3a +{ +namespace internal +{ +template +class BatchImpl : public algorithms::engines::mrg32k3a::interface1::Batch, + public algorithms::engines::internal::BatchBaseImpl +{ +public: + typedef algorithms::engines::mrg32k3a::interface1::Batch super1; + typedef algorithms::engines::internal::BatchBaseImpl super2; + BatchImpl(size_t seed = 777) : baseRng(seed, __DAAL_BRNG_MRG32K3A), super2(seed) {} + + void * getState() DAAL_C11_OVERRIDE { return baseRng.getState(); } + + int getStateSize() const DAAL_C11_OVERRIDE { return baseRng.getStateSize(); } + + services::Status saveStateImpl(byte * dest) const DAAL_C11_OVERRIDE + { + DAAL_CHECK(!baseRng.saveState((void *)dest), ErrorIncorrectErrorcodeFromGenerator); + return services::Status(); + } + + services::Status loadStateImpl(const byte * src) DAAL_C11_OVERRIDE + { + DAAL_CHECK(!baseRng.loadState((const void *)src), ErrorIncorrectErrorcodeFromGenerator); + return services::Status(); + } + + services::Status leapfrogImpl(size_t threadNum, size_t nThreads) DAAL_C11_OVERRIDE + { + int errcode = baseRng.leapfrog(threadNum, nThreads); + services::Status s; + if (errcode == __DAAL_RNG_ERROR_LEAPFROG_UNSUPPORTED) + s.add(ErrorLeapfrogUnsupported); + else if (errcode) + s.add(ErrorIncorrectErrorcodeFromGenerator); + return s; + } + + services::Status skipAheadImpl(size_t nSkip) DAAL_C11_OVERRIDE + { + int errcode = baseRng.skipAhead(nSkip); + services::Status s; + if (errcode == __DAAL_RNG_ERROR_SKIPAHEAD_UNSUPPORTED) + s.add(ErrorSkipAheadUnsupported); + else if (errcode) + s.add(ErrorIncorrectErrorcodeFromGenerator); + return s; + } + + virtual BatchImpl * cloneImpl() const DAAL_C11_OVERRIDE + { + return new BatchImpl(*this); + } + + bool hasSupport(engines::internal::ParallelizationTechnique technique) const DAAL_C11_OVERRIDE + { + switch (technique) + { + case engines::internal::family: return false; + case engines::internal::skipahead: return true; + case engines::internal::leapfrog: return true; + } + return false; + } + + ~BatchImpl() {} + +protected: + BatchImpl(const BatchImpl & other) : super1(other), super2(other), baseRng(other.baseRng) {} + + daal::internal::BaseRNGsInst baseRng; +}; + +} // namespace internal +} // namespace mrg32k3a +} // namespace engines +} // namespace algorithms +} // namespace daal diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_cpu.cpp b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_cpu.cpp new file mode 100644 index 00000000000..529c4af2635 --- /dev/null +++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_cpu.cpp @@ -0,0 +1,47 @@ +/* file: mrg32k3a_dense_default_batch_fpt_cpu.cpp */ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +//++ +// Implementation of mrg32k3a calculation functions. +//-- + +#include "src/algorithms/engines/mrg32k3a/mrg32k3a_batch_container.h" +#include "src/algorithms/engines/mrg32k3a/mrg32k3a_kernel.h" +#include "src/algorithms/engines/mrg32k3a/mrg32k3a_impl.i" + +namespace daal +{ +namespace algorithms +{ +namespace engines +{ +namespace mrg32k3a +{ +namespace interface1 +{ +template class BatchContainer; +} // namespace interface1 + +namespace internal +{ +template class mrg32k3aKernel; +} // namespace internal + +} // namespace mrg32k3a +} // namespace engines +} // namespace algorithms +} // namespace daal diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_dispatcher.cpp new file mode 100644 index 00000000000..fd78108df73 --- /dev/null +++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_dense_default_batch_fpt_dispatcher.cpp @@ -0,0 +1,30 @@ +/* file: mrg32k3a_dense_default_batch_fpt_dispatcher.cpp */ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +//++ +// Implementation of mrg32k3a calculation algorithm dispatcher. +//-- + +#include "src/algorithms/engines/mrg32k3a/mrg32k3a_batch_container.h" + +namespace daal +{ +namespace algorithms +{ +__DAAL_INSTANTIATE_DISPATCH_CONTAINER(engines::mrg32k3a::BatchContainer, batch, DAAL_FPTYPE, engines::mrg32k3a::defaultDense) +} // namespace algorithms +} // namespace daal diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_impl.i b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_impl.i new file mode 100644 index 00000000000..f8f12b2deea --- /dev/null +++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_impl.i @@ -0,0 +1,49 @@ +/* file: mrg32k3a_impl.i */ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/* +//++ +// Implementation of mrg32k3a algorithm. +//-- +*/ + +#ifndef __MRG32K3A_IMPL_I__ +#define __MRG32K3A_IMPL_I__ + +namespace daal +{ +namespace algorithms +{ +namespace engines +{ +namespace mrg32k3a +{ +namespace internal +{ +template +Status mrg32k3aKernel::compute(NumericTable * resultTensor) +{ + return Status(); +} + +} // namespace internal +} // namespace mrg32k3a +} // namespace engines +} // namespace algorithms +} // namespace daal + +#endif diff --git a/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_kernel.h b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_kernel.h new file mode 100644 index 00000000000..80c9fbe44d9 --- /dev/null +++ b/cpp/daal/src/algorithms/engines/mrg32k3a/mrg32k3a_kernel.h @@ -0,0 +1,58 @@ +/* file: mrg32k3a_kernel.h */ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +//++ +// Declaration of a template function for calculating values using the MRG32k3a generator. 
+//-- + +#ifndef __MRG32K3A_KERNEL_H__ +#define __MRG32K3A_KERNEL_H__ + +#include "algorithms/engines/mrg32k3a/mrg32k3a.h" +#include "src/algorithms/kernel.h" +#include "data_management/data/numeric_table.h" + +using namespace daal::services; +using namespace daal::data_management; + +namespace daal +{ +namespace algorithms +{ +namespace engines +{ +namespace mrg32k3a +{ +namespace internal +{ +/** + * \brief Kernel for mrg32k3a calculation + */ +template +class mrg32k3aKernel : public Kernel +{ +public: + Status compute(NumericTable * resultTable); +}; + +} // namespace internal +} // namespace mrg32k3a +} // namespace engines +} // namespace algorithms +} // namespace daal + +#endif diff --git a/cpp/daal/src/algorithms/engines/mt19937/mt19937_batch_impl.h b/cpp/daal/src/algorithms/engines/mt19937/mt19937_batch_impl.h index e92d0e46612..805ded3153c 100644 --- a/cpp/daal/src/algorithms/engines/mt19937/mt19937_batch_impl.h +++ b/cpp/daal/src/algorithms/engines/mt19937/mt19937_batch_impl.h @@ -26,9 +26,6 @@ #include "src/externals/service_rng.h" #include "src/data_management/service_numeric_table.h" -static const int leapfrogMethodErrcode = -1002; -static const int skipAheadMethodErrcode = -1003; - namespace daal { namespace algorithms @@ -67,7 +64,7 @@ class BatchImpl : public algorithms::engines::mt19937::interface1::Batchleapfrog(threadNum, nThreads); services::Status s; - if (errcode == leapfrogMethodErrcode) + if (errcode == __DAAL_RNG_ERROR_LEAPFROG_UNSUPPORTED) s.add(ErrorLeapfrogUnsupported); else if (errcode) s.add(ErrorIncorrectErrorcodeFromGenerator); @@ -199,7 +196,7 @@ class BatchImpl : public algorithms::engines::mt2203::interface1::BatchskipAhead(nSkip); services::Status s; - if (errcode == skipAheadMethodErrcode) + if (errcode == __DAAL_RNG_ERROR_SKIPAHEAD_UNSUPPORTED) s.add(ErrorSkipAheadUnsupported); else if (errcode) s.add(ErrorIncorrectErrorcodeFromGenerator); diff --git a/cpp/daal/src/algorithms/engines/mt2203/mt2203_kernel.h b/cpp/daal/src/algorithms/engines/mt2203/mt2203_kernel.h index b7de119367f..e588a02c8fb 100644 --- a/cpp/daal/src/algorithms/engines/mt2203/mt2203_kernel.h +++ b/cpp/daal/src/algorithms/engines/mt2203/mt2203_kernel.h @@ -19,8 +19,8 @@ // Declaration of template function that calculate mt2203s. //-- -#ifndef __MCG59_KERNEL_H__ -#define __MCG59_KERNEL_H__ +#ifndef __MT2203_KERNEL_H__ +#define __MT2203_KERNEL_H__ #include "algorithms/engines/mt2203/mt2203.h" #include "src/algorithms/kernel.h" diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10.cpp b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10.cpp new file mode 100644 index 00000000000..47fb7dae70f --- /dev/null +++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10.cpp @@ -0,0 +1,59 @@ +/* file: philox4x32x10.cpp */ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +//++ +// Implementation of the Philox4x32-10 engine: a counter-based pseudorandom number generator (PRNG) +// that uses 4x32-bit keys and performs 10 rounds of mixing to produce high-quality randomness. +//-- + +#include "algorithms/engines/philox4x32x10/philox4x32x10.h" +#include "src/externals/service_dispatch.h" +#include "src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h" + +namespace daal +{ +namespace algorithms +{ +namespace engines +{ +namespace philox4x32x10 +{ +namespace interface1 +{ +using namespace daal::services; +using namespace philox4x32x10::internal; + +template +SharedPtr > Batch::create(size_t seed) +{ + SharedPtr > engPtr; +#define DAAL_CREATE_ENGINE_CPU(cpuId, ...) engPtr.reset(new BatchImpl(__VA_ARGS__)); + + DAAL_DISPATCH_FUNCTION_BY_CPU(DAAL_CREATE_ENGINE_CPU, seed); + +#undef DAAL_CREATE_ENGINE_CPU + return engPtr; +} + +template class Batch; +template class Batch; + +} // namespace interface1 +} // namespace philox4x32x10 +} // namespace engines +} // namespace algorithms +} // namespace daal diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_container.h b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_container.h new file mode 100644 index 00000000000..9cb747e95a8 --- /dev/null +++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_container.h @@ -0,0 +1,68 @@ +/* file: philox4x32x10_batch_container.h */ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/* +//++ +// Implementation of philox4x32x10 calculation algorithm container. 
+//-- +*/ + +#ifndef __PHILOX4X32X10_BATCH_CONTAINER_H__ +#define __PHILOX4X32X10_BATCH_CONTAINER_H__ + +#include "algorithms/engines/philox4x32x10/philox4x32x10.h" +#include "src/algorithms/engines/philox4x32x10/philox4x32x10_kernel.h" + +namespace daal +{ +namespace algorithms +{ +namespace engines +{ +namespace philox4x32x10 +{ +namespace interface1 +{ +template +BatchContainer::BatchContainer(daal::services::Environment::env * daalEnv) : AnalysisContainerIface(daalEnv) +{ + __DAAL_INITIALIZE_KERNELS(internal::philox4x32x10Kernel, algorithmFPType, method); +} + +template +BatchContainer::~BatchContainer() +{ + __DAAL_DEINITIALIZE_KERNELS(); +} + +template +services::Status BatchContainer::compute() +{ + daal::services::Environment::env & env = *_env; + engines::Result * result = static_cast(_res); + NumericTable * resultTable = result->get(engines::randomNumbers).get(); + + __DAAL_CALL_KERNEL(env, internal::philox4x32x10Kernel, __DAAL_KERNEL_ARGUMENTS(algorithmFPType, method), compute, resultTable); +} + +} // namespace interface1 +} // namespace philox4x32x10 +} // namespace engines +} // namespace algorithms +} // namespace daal + +#endif diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h new file mode 100644 index 00000000000..1f7b40526ac --- /dev/null +++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_batch_impl.h @@ -0,0 +1,114 @@ +/* file: philox4x32x10_batch_impl.h */ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +/* +//++ +// Implementation of the class defining the philox4x32x10 engine +//-- +*/ + +#include "algorithms/engines/philox4x32x10/philox4x32x10.h" +#include "src/algorithms/engines/engine_batch_impl.h" +#include "src/externals/service_rng.h" +#include "src/data_management/service_numeric_table.h" + +namespace daal +{ +namespace algorithms +{ +namespace engines +{ +namespace philox4x32x10 +{ +namespace internal +{ +template +class BatchImpl : public algorithms::engines::philox4x32x10::interface1::Batch, + public algorithms::engines::internal::BatchBaseImpl +{ +public: + typedef algorithms::engines::philox4x32x10::interface1::Batch super1; + typedef algorithms::engines::internal::BatchBaseImpl super2; + BatchImpl(size_t seed = 777) : baseRng(seed, __DAAL_BRNG_PHILOX4X32X10), super2(seed) {} + + void * getState() DAAL_C11_OVERRIDE { return baseRng.getState(); } + + int getStateSize() const DAAL_C11_OVERRIDE { return baseRng.getStateSize(); } + + services::Status saveStateImpl(byte * dest) const DAAL_C11_OVERRIDE + { + DAAL_CHECK(!baseRng.saveState((void *)dest), ErrorIncorrectErrorcodeFromGenerator); + return services::Status(); + } + + services::Status loadStateImpl(const byte * src) DAAL_C11_OVERRIDE + { + DAAL_CHECK(!baseRng.loadState((const void *)src), ErrorIncorrectErrorcodeFromGenerator); + return services::Status(); + } + + services::Status leapfrogImpl(size_t threadNum, size_t nThreads) DAAL_C11_OVERRIDE + { + int errcode = baseRng.leapfrog(threadNum, nThreads); + services::Status s; + if (errcode == __DAAL_RNG_ERROR_LEAPFROG_UNSUPPORTED) + s.add(ErrorLeapfrogUnsupported); + else if (errcode) + s.add(ErrorIncorrectErrorcodeFromGenerator); + return s; + } + + services::Status skipAheadImpl(size_t nSkip) DAAL_C11_OVERRIDE + { + int errcode = baseRng.skipAhead(nSkip); + services::Status s; + if (errcode == __DAAL_RNG_ERROR_SKIPAHEAD_UNSUPPORTED) + s.add(ErrorSkipAheadUnsupported); + else if (errcode) + s.add(ErrorIncorrectErrorcodeFromGenerator); + return s; + } + + virtual BatchImpl * cloneImpl() const DAAL_C11_OVERRIDE + { + return new BatchImpl(*this); + } + + bool hasSupport(engines::internal::ParallelizationTechnique technique) const DAAL_C11_OVERRIDE + { + switch (technique) + { + case engines::internal::family: return false; + case engines::internal::skipahead: return true; + case engines::internal::leapfrog: return false; + } + return false; + } + + ~BatchImpl() {} + +protected: + BatchImpl(const BatchImpl & other) : super1(other), super2(other), baseRng(other.baseRng) {} + + daal::internal::BaseRNGsInst baseRng; +}; + +} // namespace internal +} // namespace philox4x32x10 +} // namespace engines +} // namespace algorithms +} // namespace daal diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_cpu.cpp b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_cpu.cpp new file mode 100644 index 00000000000..946517c1d9c --- /dev/null +++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_cpu.cpp @@ -0,0 +1,47 @@ +/* file: philox4x32x10_dense_default_batch_fpt_cpu.cpp */ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +//++ +// Implementation of philox4x32x10 calculation functions. +//-- + +#include "src/algorithms/engines/philox4x32x10/philox4x32x10_batch_container.h" +#include "src/algorithms/engines/philox4x32x10/philox4x32x10_kernel.h" +#include "src/algorithms/engines/philox4x32x10/philox4x32x10_impl.i" + +namespace daal +{ +namespace algorithms +{ +namespace engines +{ +namespace philox4x32x10 +{ +namespace interface1 +{ +template class BatchContainer; +} // namespace interface1 + +namespace internal +{ +template class philox4x32x10Kernel; +} // namespace internal + +} // namespace philox4x32x10 +} // namespace engines +} // namespace algorithms +} // namespace daal diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_dispatcher.cpp b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_dispatcher.cpp new file mode 100644 index 00000000000..1640fc4ec12 --- /dev/null +++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_dense_default_batch_fpt_dispatcher.cpp @@ -0,0 +1,30 @@ +/* file: philox4x32x10_dense_default_batch_fpt_dispatcher.cpp */ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +//++ +// Implementation of philox4x32x10 calculation algorithm dispatcher. +//-- + +#include "src/algorithms/engines/philox4x32x10/philox4x32x10_batch_container.h" + +namespace daal +{ +namespace algorithms +{ +__DAAL_INSTANTIATE_DISPATCH_CONTAINER(engines::philox4x32x10::BatchContainer, batch, DAAL_FPTYPE, engines::philox4x32x10::defaultDense) +} // namespace algorithms +} // namespace daal diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_impl.i b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_impl.i new file mode 100644 index 00000000000..5aa5addc22b --- /dev/null +++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_impl.i @@ -0,0 +1,49 @@ +/* file: philox4x32x10_impl.i */ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +/* +//++ +// Implementation of philox4x32x10 algorithm. +//-- +*/ + +#ifndef __PHILOX4X32X10_IMPL_I__ +#define __PHILOX4X32X10_IMPL_I__ + +namespace daal +{ +namespace algorithms +{ +namespace engines +{ +namespace philox4x32x10 +{ +namespace internal +{ +template +Status philox4x32x10Kernel::compute(NumericTable * resultTensor) +{ + return Status(); +} + +} // namespace internal +} // namespace philox4x32x10 +} // namespace engines +} // namespace algorithms +} // namespace daal + +#endif diff --git a/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_kernel.h b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_kernel.h new file mode 100644 index 00000000000..5870d781abd --- /dev/null +++ b/cpp/daal/src/algorithms/engines/philox4x32x10/philox4x32x10_kernel.h @@ -0,0 +1,58 @@ +/* file: philox4x32x10_kernel.h */ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +//++ +// Declaration of a template function for generating values using the Philox4x32-10 engine. 
+//-- + +#ifndef __PHILOX4X32X10_KERNEL_H__ +#define __PHILOX4X32X10_KERNEL_H__ + +#include "algorithms/engines/philox4x32x10/philox4x32x10.h" +#include "src/algorithms/kernel.h" +#include "data_management/data/numeric_table.h" + +using namespace daal::services; +using namespace daal::data_management; + +namespace daal +{ +namespace algorithms +{ +namespace engines +{ +namespace philox4x32x10 +{ +namespace internal +{ +/** + * \brief Kernel for philox4x32x10 calculation + */ +template +class philox4x32x10Kernel : public Kernel +{ +public: + Status compute(NumericTable * resultTable); +}; + +} // namespace internal +} // namespace philox4x32x10 +} // namespace engines +} // namespace algorithms +} // namespace daal + +#endif diff --git a/cpp/daal/src/externals/service_rng_mkl.h b/cpp/daal/src/externals/service_rng_mkl.h index b2dcd81b78b..425695c7f66 100644 --- a/cpp/daal/src/externals/service_rng_mkl.h +++ b/cpp/daal/src/externals/service_rng_mkl.h @@ -32,6 +32,8 @@ #define __DAAL_BRNG_MT2203 VSL_BRNG_MT2203 #define __DAAL_BRNG_MT19937 VSL_BRNG_MT19937 #define __DAAL_BRNG_MCG59 VSL_BRNG_MCG59 +#define __DAAL_BRNG_MRG32K3A VSL_BRNG_MRG32K3A +#define __DAAL_BRNG_PHILOX4X32X10 VSL_BRNG_PHILOX4X32X10 #define __DAAL_RNG_METHOD_UNIFORM_STD VSL_RNG_METHOD_UNIFORM_STD #define __DAAL_RNG_METHOD_UNIFORMBITS32_STD 0 #define __DAAL_RNG_METHOD_BERNOULLI_ICDF VSL_RNG_METHOD_BERNOULLI_ICDF @@ -39,6 +41,10 @@ #define __DAAL_RNG_METHOD_GAUSSIAN_BOXMULLER2 VSL_RNG_METHOD_GAUSSIAN_BOXMULLER2 #define __DAAL_RNG_METHOD_GAUSSIAN_ICDF VSL_RNG_METHOD_GAUSSIAN_ICDF +// Errors +#define __DAAL_RNG_ERROR_LEAPFROG_UNSUPPORTED -1002 +#define __DAAL_RNG_ERROR_SKIPAHEAD_UNSUPPORTED -1003 + namespace daal { namespace internal diff --git a/cpp/daal/src/externals/service_rng_openrng.h b/cpp/daal/src/externals/service_rng_openrng.h index dd70c644606..0e49c62c83b 100644 --- a/cpp/daal/src/externals/service_rng_openrng.h +++ b/cpp/daal/src/externals/service_rng_openrng.h @@ -25,6 +25,8 @@ #define __DAAL_BRNG_MT2203 VSL_BRNG_MT2203 #define __DAAL_BRNG_MT19937 VSL_BRNG_MT19937 #define __DAAL_BRNG_MCG59 VSL_BRNG_MCG59 +#define __DAAL_BRNG_MRG32K3A VSL_BRNG_MRG32K3A +#define __DAAL_BRNG_PHILOX4X32X10 VSL_BRNG_PHILOX4X32X10 #define __DAAL_RNG_METHOD_UNIFORM_STD VSL_RNG_METHOD_UNIFORM_STD #define __DAAL_RNG_METHOD_UNIFORMBITS32_STD 0 #define __DAAL_RNG_METHOD_BERNOULLI_ICDF VSL_RNG_METHOD_BERNOULLI_ICDF @@ -32,6 +34,10 @@ #define __DAAL_RNG_METHOD_GAUSSIAN_BOXMULLER2 VSL_RNG_METHOD_GAUSSIAN_BOXMULLER2 #define __DAAL_RNG_METHOD_GAUSSIAN_ICDF VSL_RNG_METHOD_GAUSSIAN_ICDF +// Errors +#define __DAAL_RNG_ERROR_LEAPFROG_UNSUPPORTED -1002 +#define __DAAL_RNG_ERROR_SKIPAHEAD_UNSUPPORTED -1003 + namespace daal { namespace internal diff --git a/cpp/daal/src/externals/service_rng_ref.h b/cpp/daal/src/externals/service_rng_ref.h index fc56fcf6205..f2df997c87d 100644 --- a/cpp/daal/src/externals/service_rng_ref.h +++ b/cpp/daal/src/externals/service_rng_ref.h @@ -36,16 +36,22 @@ #include // RNGs - #define __DAAL_BRNG_MT2203 (1 << 20) * 9 //VSL_BRNG_MT2203 - #define __DAAL_BRNG_MT19937 (1 << 20) * 8 //VSL_BRNG_MT19937 - #define __DAAL_BRNG_MCG59 (1 << 20) * 4 //VSL_BRNG_MCG59 - - #define __DAAL_RNG_METHOD_UNIFORM_STD 0 //VSL_RNG_METHOD_UNIFORM_STD - #define __DAAL_RNG_METHOD_UNIFORMBITS32_STD 4 - #define __DAAL_RNG_METHOD_BERNOULLI_ICDF 0 //VSL_RNG_METHOD_BERNOULLI_ICDF - #define __DAAL_RNG_METHOD_GAUSSIAN_BOXMULLER 0 //VSL_RNG_METHOD_GAUSSIAN_BOXMULLER - #define __DAAL_RNG_METHOD_GAUSSIAN_BOXMULLER2 1 //VSL_RNG_METHOD_GAUSSIAN_BOXMULLER2 - 
#define __DAAL_RNG_METHOD_GAUSSIAN_ICDF 2 //VSL_RNG_METHOD_GAUSSIAN_ICDF + #define __DAAL_BRNG_MT2203 (1 << 20) * 9 //VSL_BRNG_MT2203 + #define __DAAL_BRNG_MT19937 (1 << 20) * 8 //VSL_BRNG_MT19937 + #define __DAAL_BRNG_MCG59 (1 << 20) * 4 //VSL_BRNG_MCG59 + #define __DAAL_BRNG_MRG32K3A (1 << 20) * 3 //VSL_BRNG_MRG32K3A + #define __DAAL_BRNG_PHILOX4X32X10 (1 << 20) * 16 //VSL_BRNG_PHILOX4X32X10 + + #define __DAAL_RNG_METHOD_UNIFORM_STD 0 //VSL_RNG_METHOD_UNIFORM_STD + #define __DAAL_RNG_METHOD_UNIFORMBITS32_STD 4 + #define __DAAL_RNG_METHOD_BERNOULLI_ICDF 0 //VSL_RNG_METHOD_BERNOULLI_ICDF + #define __DAAL_RNG_METHOD_GAUSSIAN_BOXMULLER 0 //VSL_RNG_METHOD_GAUSSIAN_BOXMULLER + #define __DAAL_RNG_METHOD_GAUSSIAN_BOXMULLER2 1 //VSL_RNG_METHOD_GAUSSIAN_BOXMULLER2 + #define __DAAL_RNG_METHOD_GAUSSIAN_ICDF 2 //VSL_RNG_METHOD_GAUSSIAN_ICDF + + // Errors + #define __DAAL_RNG_ERROR_LEAPFROG_UNSUPPORTED -1002 + #define __DAAL_RNG_ERROR_SKIPAHEAD_UNSUPPORTED -1003 namespace daal { diff --git a/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp b/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp index 4da1866e277..bdcc3f1487a 100644 --- a/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp +++ b/cpp/oneapi/dal/algo/connected_components/backend/cpu/vertex_partitioning_default_kernel.hpp @@ -24,7 +24,7 @@ #include "oneapi/dal/backend/memory.hpp" #include "oneapi/dal/backend/interop/common.hpp" #include "oneapi/dal/table/homogen.hpp" -#include "oneapi/dal/backend/primitives/rng/rng_engine.hpp" +#include "oneapi/dal/backend/primitives/rng/rng.hpp" #include "oneapi/dal/detail/threading.hpp" namespace oneapi::dal::preview::connected_components::backend { @@ -90,9 +90,12 @@ std::int32_t most_frequent_element(const std::atomic *components, const std::int64_t &samples_count = 1024) { std::int32_t *rnd_vertex_ids = allocate(vertex_allocator, samples_count); - dal::backend::primitives::engine eng; - dal::backend::primitives::rng rn_gen; - rn_gen.uniform(samples_count, rnd_vertex_ids, eng.get_state(), 0, vertex_count); + dal::backend::primitives::host_engine eng; + dal::backend::primitives::uniform(samples_count, + rnd_vertex_ids, + eng, + 0, + vertex_count); std::int32_t *root_sample_counts = allocate(vertex_allocator, vertex_count); diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/infer_kernel_impl_dpc.cpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/infer_kernel_impl_dpc.cpp index 19da49ffb74..23bcca39fde 100644 --- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/infer_kernel_impl_dpc.cpp +++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/infer_kernel_impl_dpc.cpp @@ -126,6 +126,7 @@ infer_kernel_impl::predict_by_tree_group_weighted( const Float* cls_prb_list_ptr = class_proba_list.get_data(); Index obs_tree_group_response_count = ctx.class_count * ctx.tree_in_group_count; + de::check_mul_overflow(ctx.row_count, obs_tree_group_response_count); auto [obs_response_list, zero_obs_response_event] = pr::ndarray::zeros(queue_, diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/infer_model_manager.hpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/infer_model_manager.hpp index c0dd4cca565..770f73b5269 100644 --- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/infer_model_manager.hpp +++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/infer_model_manager.hpp @@ -87,7 +87,6 @@ class infer_model_manager { } max_tree_size_ = 
dal::detail::integral_cast(tree_size_max); - const Index tree_block_size = dal::detail::check_mul_overflow(max_tree_size_, tree_count); auto fi_list_host = dal::backend::primitives::ndarray::empty({ tree_block_size }); diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_feature_type_dpc.cpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_feature_type_dpc.cpp index 7306533ed50..ee17b4ffabd 100644 --- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_feature_type_dpc.cpp +++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_feature_type_dpc.cpp @@ -20,6 +20,7 @@ #include "oneapi/dal/table/row_accessor.hpp" #include "oneapi/dal/backend/memory.hpp" #include "oneapi/dal/detail/profiler.hpp" +#include #ifdef ONEDAL_DATA_PARALLEL @@ -29,6 +30,12 @@ namespace de = dal::detail; namespace bk = dal::backend; namespace pr = dal::backend::primitives; +template +std::int64_t propose_block_size(const sycl::queue& q, const std::int64_t r) { + constexpr std::int64_t fsize = sizeof(Float); + return 0x10000l * (8 / fsize); +} + template inline sycl::event sort_inplace(sycl::queue& queue_, pr::ndarray& src, @@ -56,18 +63,29 @@ sycl::event indexed_features::extract_column( Float* values = values_nd.get_mutable_data(); Index* indices = indices_nd.get_mutable_data(); auto column_count = column_count_; - - const sycl::range<1> range = de::integral_cast(row_count_); - - auto event = queue_.submit([&](sycl::handler& h) { - h.depends_on(deps); - h.parallel_for(range, [=](sycl::id<1> idx) { - values[idx] = data[idx * column_count + feature_id]; - indices[idx] = idx; + const auto block_size = propose_block_size(queue_, row_count_); + const bk::uniform_blocking blocking(row_count_, block_size); + + std::vector events(blocking.get_block_count()); + for (std::int64_t block_index = 0; block_index < blocking.get_block_count(); ++block_index) { + const auto first_row = blocking.get_block_start_index(block_index); + const auto last_row = blocking.get_block_end_index(block_index); + const auto curr_block = last_row - first_row; + ONEDAL_ASSERT(curr_block > 0); + + auto event = queue_.submit([&](sycl::handler& cgh) { + cgh.depends_on(deps); + cgh.parallel_for<>(de::integral_cast(curr_block), [=](sycl::id<1> idx) { + const std::int64_t row = idx + first_row; + + values[row] = data[row * column_count + feature_id]; + indices[row] = row; + }); }); - }); - return event; + events.push_back(event); + } + return bk::wait_or_pass(events); } template diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp index 9dfe252e849..d15f1704065 100644 --- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp +++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2021 Intel Corporation +* Copyright 2021-2022 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -20,7 +20,8 @@ #include "oneapi/dal/backend/primitives/ndarray.hpp" #include "oneapi/dal/backend/primitives/utils.hpp" #include "oneapi/dal/algo/decision_forest/train_types.hpp" - +#include "oneapi/dal/backend/primitives/rng/rng.hpp" +#include "oneapi/dal/backend/primitives/rng/rng.hpp" #include "oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp" #include "oneapi/dal/algo/decision_forest/backend/gpu/train_misc_structs.hpp" @@ -50,7 +51,7 @@ class train_kernel_hist_impl { using model_manager_t = train_model_manager; using train_context_t = train_context; using imp_data_t = impurity_data; - using rng_engine_t = pr::engine; + using rng_engine_t = pr::dpc_engine; using rng_engine_list_t = std::vector; using msg = dal::detail::error_messages; using comm_t = bk::communicator; @@ -62,7 +63,7 @@ class train_kernel_hist_impl { train_kernel_hist_impl(const bk::context_gpu& ctx) : queue_(ctx.get_queue()), comm_(ctx.get_communicator()), - train_service_kernels_(queue_) {} + train_service_kernels_(ctx.get_queue()) {} ~train_kernel_hist_impl() = default; result_t operator()(const descriptor_t& desc, @@ -83,13 +84,11 @@ class train_kernel_hist_impl { pr::ndarray& node_list, pr::ndarray& tree_order_level, Index engine_offset, - Index node_count); + Index node_count, + const bk::event_vector& deps = {}); void validate_input(const descriptor_t& desc, const table& data, const table& labels) const; - Index get_row_total_count(bool distr_mode, Index row_count); - Index get_global_row_offset(bool distr_mode, Index row_count); - /// Initializes `ctx` training context structure based on data and /// descriptor class. Filling and calculating all parameters in context, /// for example, tree count, required memory size, calculating indexed features, etc. @@ -149,6 +148,24 @@ class train_kernel_hist_impl { Index node_count, const bk::event_vector& deps = {}); + sycl::event compute_initial_imp_for_node_list_regression( + const train_context_t& ctx, + const pr::ndarray& node_list, + const pr::ndarray& local_sum_hist, + const pr::ndarray& local_sum2cent_hist, + imp_data_t& imp_data_list, + Index node_count, + const bk::event_vector& deps = {}); + + sycl::event compute_local_sum_histogram(const train_context_t& ctx, + const pr::ndarray& response, + const pr::ndarray& tree_order, + const pr::ndarray& node_list, + pr::ndarray& local_sum_hist, + pr::ndarray& local_sum2cent_hist, + Index node_count, + const bk::event_vector& deps = {}); + /// Computes initial histograms for each node to compute impurity. 
/// /// @param[in] ctx a training context structure for a GPU backend @@ -575,7 +592,7 @@ class train_kernel_hist_impl { pr::ndarray& oob_per_obs_list, pr::ndarray& var_imp, pr::ndarray& var_imp_variance, - const rng_engine_list_t& rng_engine_arr, + rng_engine_list_t& rng_engine_arr, Index tree_idx, Index tree_in_block, Index built_tree_count, diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp index fc875683784..69593a81321 100644 --- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp +++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl_dpc.cpp @@ -1,5 +1,5 @@ /******************************************************************************* -* Copyright 2021 Intel Corporation +* Copyright 2021-2022 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,7 +19,7 @@ #include "oneapi/dal/table/row_accessor.hpp" #include "oneapi/dal/detail/profiler.hpp" #include "oneapi/dal/algo/decision_forest/backend/gpu/train_helpers.hpp" - +#include #ifdef ONEDAL_DATA_PARALLEL #include "oneapi/dal/algo/decision_forest/backend/gpu/train_kernel_hist_impl.hpp" @@ -91,47 +91,16 @@ void train_kernel_hist_impl::validate_input(const descr } } -template -Index train_kernel_hist_impl::get_row_total_count(bool distr_mode, - Index row_count) { - Index row_total_count = row_count; - - if (distr_mode) { - ONEDAL_PROFILER_TASK(allreduce_row_total_count); - comm_.allreduce(row_total_count).wait(); - } - - return row_total_count; -} - -template -Index train_kernel_hist_impl::get_global_row_offset(bool distr_mode, - Index row_count) { - Index global_row_offset = 0; - - if (distr_mode) { - auto row_count_list_host = pr::ndarray::empty({ comm_.get_rank_count() }); - Index* row_count_list_host_ptr = row_count_list_host.get_mutable_data(); - { - ONEDAL_PROFILER_TASK(allgather_row_count); - comm_.allgather(row_count, row_count_list_host.flatten()).wait(); - } - - for (std::int64_t i = 0; i < comm_.get_rank(); ++i) { - global_row_offset += row_count_list_host_ptr[i]; - } - } - - return global_row_offset; -} - template void train_kernel_hist_impl::init_params(train_context_t& ctx, const descriptor_t& desc, const table& data, const table& responses, const table& weights) { - ctx.distr_mode_ = (comm_.get_rank_count() > 1); + ONEDAL_PROFILER_TASK(init_params, queue_); + std::int64_t rank_count = comm_.get_rank_count(); + ctx.distr_mode_ = (rank_count > 1); + auto current_rank = comm_.get_rank(); ctx.use_private_mem_buf_ = true; @@ -143,7 +112,11 @@ void train_kernel_hist_impl::init_params(train_context_ } ctx.row_count_ = de::integral_cast(data.get_row_count()); - ctx.row_total_count_ = get_row_total_count(ctx.distr_mode_, ctx.row_count_); + ctx.row_total_count_ = ctx.row_count_; + { + ONEDAL_PROFILER_TASK(allreduce_total_row_count_exactly_it, queue_); + comm_.allreduce(ctx.row_total_count_, spmd::reduce_op::sum).wait(); + } ctx.column_count_ = de::integral_cast(data.get_column_count()); @@ -151,10 +124,22 @@ void train_kernel_hist_impl::init_params(train_context_ ctx.selected_row_count_ = ctx.distr_mode_ ? 
impl_const_t::bad_val_ : desc.get_observations_per_tree_fraction() * ctx.row_count_; + ctx.selected_row_total_count_ = desc.get_observations_per_tree_fraction() * ctx.row_total_count_; - ctx.global_row_offset_ = get_global_row_offset(ctx.distr_mode_, ctx.row_count_); + auto global_rank_offsets = array::zeros(rank_count); + global_rank_offsets.get_mutable_data()[current_rank] = ctx.row_count_; + { + ONEDAL_PROFILER_TASK(allreduce_recv_counts, queue_); + comm_.allreduce(global_rank_offsets, spmd::reduce_op::sum).wait(); + } + + ctx.global_row_offset_ = 0; + for (std::int64_t i = 0; i < current_rank; i++) { + ONEDAL_ASSERT(global_rank_offsets.get_data()[i] >= 0); + ctx.global_row_offset_ += global_rank_offsets.get_data()[i]; + } ctx.tree_count_ = de::integral_cast(desc.get_tree_count()); @@ -211,7 +196,7 @@ void train_kernel_hist_impl::init_params(train_context_ bin_borders_host_[clmn_idx] = ind_ftrs.get_bin_borders(clmn_idx).to_host(queue_); } - data_host_ = pr::table2ndarray_1d(queue_, data, alloc::device).to_host(queue_); + data_host_ = pr::table2ndarray_1d(queue_, data, alloc::host); response_nd_ = pr::table2ndarray_1d(queue_, responses, alloc::device); @@ -332,7 +317,9 @@ void train_kernel_hist_impl::init_params(train_context_ template void train_kernel_hist_impl::allocate_buffers(const train_context_t& ctx) { - de::check_mul_overflow(ctx.selected_row_total_count_, ctx.tree_in_block_); + ONEDAL_PROFILER_TASK(allocate_buffers, queue_); + de::check_mul_overflow(ctx.selected_row_total_count_, + static_cast(ctx.tree_in_block_)); // main tree order and auxilliary one are used for partitioning tree_order_lev_ = @@ -372,7 +359,8 @@ sycl::event train_kernel_hist_impl::gen_initial_tree_or pr::ndarray& node_list_host, pr::ndarray& tree_order_level, Index engine_offset, - Index node_count) { + Index node_count, + const bk::event_vector& deps) { ONEDAL_PROFILER_TASK(gen_initial_tree_order, queue_); ONEDAL_ASSERT(node_list_host.get_count() == node_count * impl_const_t::node_prop_count_); @@ -382,50 +370,74 @@ sycl::event train_kernel_hist_impl::gen_initial_tree_or sycl::event last_event; if (ctx.bootstrap_) { - auto selected_row_global_host = - pr::ndarray::empty({ ctx.selected_row_total_count_ * ctx.tree_in_block_ }); - pr::ndarray selected_row_host; + auto selected_row_global = + pr::ndarray::empty(queue_, + { ctx.selected_row_total_count_ * ctx.tree_in_block_ }, + alloc::device); + pr::ndarray selected_row; if (ctx.distr_mode_) { - selected_row_host = pr::ndarray::empty( - { ctx.selected_row_total_count_ * ctx.tree_in_block_ }); + selected_row = + pr::ndarray::empty(queue_, + { ctx.selected_row_total_count_ * ctx.tree_in_block_ }, + alloc::device); } - Index* const selected_row_global_ptr = selected_row_global_host.get_mutable_data(); - Index* const selected_row_ptr = - ctx.distr_mode_ ? selected_row_host.get_mutable_data() : nullptr; + Index* const selected_row_global_ptr = selected_row_global.get_mutable_data(); + Index* const selected_row_ptr = ctx.distr_mode_ ? 
selected_row.get_mutable_data() : nullptr; Index* const node_list_ptr = node_list_host.get_mutable_data(); for (Index node_idx = 0; node_idx < node_count; ++node_idx) { - pr::rng rn_gen; Index* gen_row_idx_global_ptr = selected_row_global_ptr + ctx.selected_row_total_count_ * node_idx; - rn_gen.uniform(ctx.selected_row_total_count_, - gen_row_idx_global_ptr, - rng_engine_list[engine_offset + node_idx].get_state(), - 0, - ctx.row_total_count_); + uniform(queue_, + ctx.selected_row_total_count_, + gen_row_idx_global_ptr, + rng_engine_list[engine_offset + node_idx], + 0, + ctx.row_total_count_, + { deps }); if (ctx.distr_mode_) { Index* node_ptr = node_list_ptr + node_idx * impl_const_t::node_prop_count_; - Index* src = gen_row_idx_global_ptr; Index* const dst = selected_row_ptr + ctx.selected_row_total_count_ * node_idx; - Index row_idx = 0; - for (Index i = 0; i < ctx.selected_row_total_count_; ++i) { - dst[i] = 0; - if (src[i] >= ctx.global_row_offset_ && - src[i] < (ctx.global_row_offset_ + ctx.row_count_)) { - dst[row_idx++] = src[i] - ctx.global_row_offset_; - } - } - node_ptr[impl_const_t::ind_lrc] = row_idx; + auto [row_index, row_index_event] = + pr::ndarray::full(queue_, 1, 0, alloc::device); + row_index_event.wait_and_throw(); + Index* row_idx_ptr = row_index.get_mutable_data(); + const sycl::nd_range<1> nd_range = + bk::make_multiple_nd_range_1d(ctx.selected_row_total_count_, 1); + auto event_ = queue_.submit([&](sycl::handler& cgh) { + cgh.depends_on({ last_event }); + cgh.parallel_for(nd_range, [=](sycl::nd_item<1> id) { + auto idx = id.get_global_id(0); + dst[idx] = 0; + if (gen_row_idx_global_ptr[idx] >= ctx.global_row_offset_ && + gen_row_idx_global_ptr[idx] < + (ctx.global_row_offset_ + ctx.row_count_)) { + sycl::atomic_ref< + Index, + sycl::memory_order::relaxed, + sycl::memory_scope::device, + sycl::access::address_space::ext_intel_global_device_space> + counter_atomic(row_idx_ptr[0]); + auto cur_idx = counter_atomic.fetch_add(1); + dst[cur_idx] = gen_row_idx_global_ptr[idx] - ctx.global_row_offset_; + } + }); + }); + auto set_event = queue_.submit([&](sycl::handler& cgh) { + cgh.depends_on(event_); + cgh.parallel_for(sycl::range<1>{ std::size_t(1) }, [=](sycl::id<1> idx) { + node_ptr[impl_const_t::ind_lrc] = row_idx_ptr[0]; + }); + }); + set_event.wait_and_throw(); } } - last_event = ctx.distr_mode_ - ? tree_order_level.assign_from_host(queue_, selected_row_host) - : tree_order_level.assign_from_host(queue_, selected_row_global_host); + ctx.distr_mode_ ? tree_order_level = selected_row : tree_order_level = selected_row_global; } else { Index row_count = ctx.selected_row_count_; @@ -433,29 +445,30 @@ sycl::event train_kernel_hist_impl::gen_initial_tree_or if (ctx.distr_mode_) { row_count = 0; if (ctx.global_row_offset_ < ctx.selected_row_total_count_) { - row_count = std::min(ctx.selected_row_total_count_ - ctx.global_row_offset_, + row_count = std::min(static_cast(ctx.selected_row_total_count_ - + ctx.global_row_offset_), ctx.row_count_); } // in case of no bootstrap // it is valid case if this worker's rows set wasn't taken for tree build // i.e. 
row_count can be eq 0 - Index* node_list_ptr = node_list_host.get_mutable_data(); - - for (Index node_idx = 0; node_idx < node_count; ++node_idx) { - Index* node_ptr = node_list_ptr + node_idx * impl_const_t::node_prop_count_; - node_ptr[impl_const_t::ind_lrc] = row_count; + auto set_event = queue_.submit([&](sycl::handler& cgh) { + cgh.parallel_for(sycl::range<1>{ std::size_t(node_count) }, [=](sycl::id<1> idx) { + Index* node_ptr = node_list_ptr + idx * impl_const_t::node_prop_count_; + node_ptr[impl_const_t::ind_lrc] = row_count; + }); + }); + set_event.wait_and_throw(); + + if (row_count > 0) { + last_event = train_service_kernels_.initialize_tree_order(tree_order_level, + node_count, + row_count, + stride); } } - - if (row_count > 0) { - last_event = train_service_kernels_.initialize_tree_order(tree_order_level, - node_count, - row_count, - stride); - } } - return last_event; } @@ -469,7 +482,6 @@ train_kernel_hist_impl::gen_feature_list( ONEDAL_PROFILER_TASK(gen_feature_list, queue_); ONEDAL_ASSERT(node_vs_tree_map_list.get_count() == node_count); - de::check_mul_overflow((node_count + 1), ctx.selected_ftr_count_); // first part is used for features indices, +1 block - part for generator auto selected_features_host = @@ -479,36 +491,43 @@ train_kernel_hist_impl::gen_feature_list( { node_count * ctx.selected_ftr_count_ }, alloc::device); - auto selected_features_host_ptr = selected_features_host.get_mutable_data(); - auto node_vs_tree_map_list_host = node_vs_tree_map_list.to_host(queue_); - pr::rng rn_gen; - auto tree_map_ptr = node_vs_tree_map_list_host.get_mutable_data(); if (ctx.selected_ftr_count_ != ctx.column_count_) { + auto tree_map_ptr = node_vs_tree_map_list_host.get_mutable_data(); + auto selected_features_host_ptr = selected_features_host.get_mutable_data(); for (Index node = 0; node < node_count; ++node) { - rn_gen.uniform_without_replacement( + pr::uniform_without_replacement( + queue_, ctx.selected_ftr_count_, selected_features_host_ptr + node * ctx.selected_ftr_count_, selected_features_host_ptr + (node + 1) * ctx.selected_ftr_count_, - rng_engine_list[tree_map_ptr[node]].get_state(), + rng_engine_list[tree_map_ptr[node]], 0, ctx.column_count_); } + auto event = selected_features_com.assign_from_host(queue_, + selected_features_host_ptr, + selected_features_com.get_count()); + + return std::tuple{ selected_features_com, event }; } else { + sycl::event fill_event; for (Index node = 0; node < node_count; ++node) { - for (Index i = 0; i < ctx.selected_ftr_count_; ++i) { - selected_features_host_ptr[node * ctx.selected_ftr_count_ + i] = i; - } + auto selected_features_host_ptr = selected_features_com.get_mutable_data(); + + fill_event = queue_.submit([&](sycl::handler& cgh) { + cgh.parallel_for( + sycl::range<1>{ std::size_t(ctx.selected_ftr_count_) }, + [=](sycl::id<1> idx) { + selected_features_host_ptr[node * ctx.selected_ftr_count_ + idx] = idx; + }); + }); } - } - auto event = selected_features_com.assign_from_host(queue_, - selected_features_host_ptr, - selected_features_com.get_count()); - - return std::tuple{ selected_features_com, event }; + return std::tuple{ selected_features_com, fill_event }; + } } template @@ -524,7 +543,6 @@ train_kernel_hist_impl::gen_random_thresholds( auto node_vs_tree_map_list_host = node_vs_tree_map.to_host(queue_); - pr::rng rn_gen; auto tree_map_ptr = node_vs_tree_map_list_host.get_mutable_data(); // Create arrays for random generated bins @@ -537,11 +555,12 @@ train_kernel_hist_impl::gen_random_thresholds( // Generate random bins for 
selected features for (Index node = 0; node < node_count; ++node) { - rn_gen.uniform(ctx.selected_ftr_count_, - random_bins_host_ptr + node * ctx.selected_ftr_count_, - rng_engine_list[tree_map_ptr[node]].get_state(), - 0.0f, - 1.0f); + pr::uniform(queue_, + ctx.selected_ftr_count_, + random_bins_host_ptr + node * ctx.selected_ftr_count_, + rng_engine_list[tree_map_ptr[node]], + 0.0f, + 1.0f); } auto event_rnd_generate = random_bins_com.assign_from_host(queue_, random_bins_host_ptr, random_bins_com.get_count()); @@ -758,54 +777,50 @@ sycl::event train_kernel_hist_impl::compute_initial_imp pr::ndarray& node_list, Index node_count, const bk::event_vector& deps) { + ONEDAL_PROFILER_TASK(compute_initial_imp_for_node_list, queue_); ONEDAL_ASSERT(imp_data_list.imp_list_.get_count() == node_count * impl_const_t::node_imp_prop_count_); - if constexpr (std::is_same_v) { - ONEDAL_ASSERT(imp_data_list.class_hist_list_.get_count() == node_count * ctx.class_count_); - } ONEDAL_ASSERT(node_list.get_count() == node_count * impl_const_t::node_prop_count_); + sycl::event event_; if constexpr (std::is_same_v) { - auto class_hist_list_host = imp_data_list.class_hist_list_.to_host(queue_, deps); - auto imp_list_host = imp_data_list.imp_list_.to_host(queue_); - auto node_list_host = node_list.to_host(queue_); + const Index* class_hist_list_ptr = imp_data_list.class_hist_list_.get_data(); + Float* imp_list_ptr = imp_data_list.imp_list_.get_mutable_data(); + Index* node_list_ptr = node_list.get_mutable_data(); + + // Launch kernel to compute impurity and winning class for each node; + // assign to the outer event_ so the submitted kernel is what gets returned + event_ = queue_.submit([&](sycl::handler& cgh) { + cgh.depends_on(deps); + cgh.parallel_for(sycl::range<1>(node_count), [=](sycl::id<1> idx) { + Index node_idx = idx; + const Index* node_histogram_ptr = class_hist_list_ptr + node_idx * ctx.class_count_; + Float* node_imp_ptr = imp_list_ptr + node_idx * impl_const_t::node_imp_prop_count_; + Index* node_ptr = node_list_ptr + node_idx * impl_const_t::node_prop_count_; - const Index* class_hist_list_host_ptr = class_hist_list_host.get_data(); - Float* imp_list_host_ptr = imp_list_host.get_mutable_data(); - Index* node_list_host_ptr = node_list_host.get_mutable_data(); + Index row_count = node_ptr[impl_const_t::ind_grc]; - for (Index node_idx = 0; node_idx < node_count; ++node_idx) { - const Index* node_histogram_ptr = - class_hist_list_host_ptr + node_idx * ctx.class_count_; - Float* node_imp_ptr = imp_list_host_ptr + node_idx * impl_const_t::node_imp_prop_count_; - Index* node_ptr = node_list_host_ptr + node_idx * impl_const_t::node_prop_count_; - - Index row_count = node_ptr[impl_const_t::ind_grc]; - - Float imp = Float(1); - Float div = Float(1) / (Float(row_count) * row_count); - Index max_cls_count = 0; - Index win_cls = 0; - Index cls_count = 0; - - for (Index cls_idx = 0; cls_idx < ctx.class_count_; ++cls_idx) { - cls_count = node_histogram_ptr[cls_idx]; - imp -= Float(cls_count) * (cls_count)*div; - - if (cls_count > max_cls_count) { - max_cls_count = cls_count; - win_cls = cls_idx; - } - } + Float imp = Float(1); + Float div = Float(1) / (Float(row_count) * row_count); + Index max_cls_count = 0; + Index win_cls = 0; + Index cls_count = 0; - node_ptr[impl_const_t::ind_win] = win_cls; - node_imp_ptr[0] = sycl::max(imp, Float(0)); - } - imp_data_list.imp_list_.assign_from_host(queue_, imp_list_host).wait_and_throw(); - node_list.assign_from_host(queue_, node_list_host).wait_and_throw(); + for (Index cls_idx = 0; cls_idx < ctx.class_count_; ++cls_idx) { + cls_count = 
node_histogram_ptr[cls_idx]; + imp -= cls_count * cls_count * div; + + if (cls_count > max_cls_count) { + max_cls_count = cls_count; + win_cls = cls_idx; + } + } + node_ptr[impl_const_t::ind_win] = win_cls; + node_imp_ptr[0] = sycl::max(imp, Float(0)); + }); + }); } - return sycl::event{}; + return event_; } template @@ -994,6 +1009,136 @@ Float* local_buf_ptr = local_buf.get_pointer().get(); return event; } +template +sycl::event train_kernel_hist_impl::compute_local_sum_histogram( + const train_context_t& ctx, + const pr::ndarray& response, + const pr::ndarray& tree_order, + const pr::ndarray& node_list, + pr::ndarray& local_sum_hist, + pr::ndarray& local_sum2cent_hist, + Index node_count, + const bk::event_vector& deps) { + ONEDAL_ASSERT(response.get_count() == ctx.row_count_); + ONEDAL_ASSERT(tree_order.get_count() == ctx.tree_in_block_ * ctx.selected_row_total_count_); + ONEDAL_ASSERT(node_list.get_count() == node_count * impl_const_t::node_prop_count_); + ONEDAL_ASSERT(local_sum_hist.get_count() == node_count); + ONEDAL_ASSERT(local_sum2cent_hist.get_count() == node_count); + + auto fill_event1 = local_sum_hist.fill(queue_, 0, deps); + auto fill_event2 = local_sum2cent_hist.fill(queue_, 0, deps); + + fill_event1.wait_and_throw(); + fill_event2.wait_and_throw(); + + const Float* response_ptr = response.get_data(); + const Index* tree_order_ptr = tree_order.get_data(); + const Index* node_list_ptr = node_list.get_data(); + Float* local_sum_hist_ptr = local_sum_hist.get_mutable_data(); + Float* local_sum2cent_hist_ptr = local_sum2cent_hist.get_mutable_data(); + + const Index node_prop_count = impl_const_t::node_prop_count_; + + auto local_size = ctx.preferable_group_size_; + const sycl::nd_range<2> nd_range = + bk::make_multiple_nd_range_2d({ local_size, node_count }, { local_size, 1 }); + + auto event = queue_.submit([&](sycl::handler& cgh) { + cgh.depends_on(deps); + local_accessor_rw_t local_sum_buf(local_size, cgh); + local_accessor_rw_t local_sum2cent_buf(local_size, cgh); + cgh.parallel_for(nd_range, [=](sycl::nd_item<2> item) { + const Index node_id = item.get_global_id()[1]; + const Index local_id = item.get_local_id()[0]; + const Index local_size = item.get_local_range()[0]; + + const Index* node_ptr = node_list_ptr + node_id * node_prop_count; + + const Index row_offset = node_ptr[impl_const_t::ind_ofs]; + const Index row_count = node_ptr[impl_const_t::ind_lrc]; + + const Index* node_tree_order_ptr = &tree_order_ptr[row_offset]; +#if __SYCL_COMPILER_VERSION >= 20230828 + Float* local_sum_buf_ptr = + local_sum_buf.template get_multi_ptr().get_raw(); + Float* local_sum2cent_buf_ptr = + local_sum2cent_buf.template get_multi_ptr().get_raw(); +#else + Float* local_sum_buf_ptr = local_sum_buf.get_pointer().get(); + Float* local_sum2cent_buf_ptr = local_sum2cent_buf.get_pointer().get(); +#endif + Float local_sum = Float(0); + Float local_sum2cent = Float(0); + for (Index i = local_id; i < row_count; i += local_size) { + Float value = response_ptr[node_tree_order_ptr[i]]; + local_sum += value; + local_sum2cent += value * value; + } + + local_sum_buf_ptr[local_id] = local_sum; + local_sum2cent_buf_ptr[local_id] = local_sum2cent; + + for (Index offset = local_size / 2; offset > 0; offset >>= 1) { + item.barrier(sycl::access::fence_space::local_space); + if (local_id < offset) { + local_sum_buf_ptr[local_id] += local_sum_buf_ptr[local_id + offset]; + local_sum2cent_buf_ptr[local_id] += local_sum2cent_buf_ptr[local_id + offset]; + } + } + + if (local_id == 0) { + 
local_sum_hist_ptr[node_id] = local_sum_buf_ptr[local_id]; + local_sum2cent_hist_ptr[node_id] = local_sum2cent_buf_ptr[local_id]; + } + }); + }); + + event.wait_and_throw(); + return event; +} + +template +sycl::event +train_kernel_hist_impl::compute_initial_imp_for_node_list_regression( + const train_context_t& ctx, + const pr::ndarray& node_list, + const pr::ndarray& local_sum_hist, + const pr::ndarray& local_sum2cent_hist, + imp_data_t& imp_data_list, + Index node_count, + const bk::event_vector& deps) { + ONEDAL_ASSERT(node_list.get_count() == node_count * impl_const_t::node_prop_count_); + ONEDAL_ASSERT(local_sum_hist.get_count() == node_count); + ONEDAL_ASSERT(local_sum2cent_hist.get_count() == node_count); + ONEDAL_ASSERT(imp_data_list.imp_list_.get_count() == + node_count * impl_const_t::node_imp_prop_count_); + + const Index* node_list_ptr = node_list.get_data(); + const Float* local_sum_hist_ptr = local_sum_hist.get_data(); + const Float* local_sum2cent_hist_ptr = local_sum2cent_hist.get_data(); + Float* imp_list_ptr = imp_data_list.imp_list_.get_mutable_data(); + + const sycl::range<1> range{ de::integral_cast(node_count) }; + + auto last_event = queue_.submit([&](sycl::handler& cgh) { + cgh.depends_on(deps); + cgh.parallel_for(range, [=](sycl::id<1> node_idx) { + // set mean + imp_list_ptr[node_idx * impl_const_t::node_imp_prop_count_ + 0] = + local_sum_hist_ptr[node_idx] / + node_list_ptr[node_idx * impl_const_t::node_prop_count_ + impl_const_t::ind_grc]; + // set sum2cent + imp_list_ptr[node_idx * impl_const_t::node_imp_prop_count_ + 1] = + local_sum2cent_hist_ptr[node_idx] - + (local_sum_hist_ptr[node_idx] * local_sum_hist_ptr[node_idx]) / + node_list_ptr[node_idx * impl_const_t::node_prop_count_ + + impl_const_t::ind_grc]; + }); + }); + + return last_event; +} + template sycl::event train_kernel_hist_impl::compute_initial_sum2cent_local( const train_context_t& ctx, @@ -1135,8 +1280,8 @@ sycl::event train_kernel_hist_impl::compute_initial_his sycl::event last_event; - if (ctx.distr_mode_) { - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { + if (ctx.distr_mode_) { last_event = compute_initial_histogram_local(ctx, response, tree_order, @@ -1156,49 +1301,44 @@ sycl::event train_kernel_hist_impl::compute_initial_his { last_event }); } else { - auto sum_list = pr::ndarray::empty(queue_, { node_count }); - auto sum2cent_list = pr::ndarray::empty(queue_, { node_count }); - last_event = compute_initial_sum_local(ctx, - response, - tree_order, - node_list, - sum_list, - node_count, - deps); - { - ONEDAL_PROFILER_TASK(sum_list, queue_); - comm_.allreduce(sum_list.flatten(queue_, { last_event })).wait(); - } - last_event = compute_initial_sum2cent_local(ctx, - response, - tree_order, - node_list, - sum_list, - sum2cent_list, - node_count, - { last_event }); - { - ONEDAL_PROFILER_TASK(allreduce_sum2cent_list, queue_); - comm_.allreduce(sum2cent_list.flatten(queue_, { last_event })).wait(); - } - last_event = fin_initial_imp(ctx, - node_list, - sum_list, - sum2cent_list, - imp_data_list, - node_count, - { last_event }); + last_event = compute_initial_histogram_local(ctx, + response, + tree_order, + node_list, + imp_data_list, + node_count, + deps); last_event.wait_and_throw(); } } else { - last_event = compute_initial_histogram_local(ctx, - response, - tree_order, - node_list, - imp_data_list, - node_count, - deps); + auto local_sum_hist = pr::ndarray::empty(queue_, { node_count }); + auto local_sum2cent_hist = pr::ndarray::empty(queue_, { node_count }); + + last_event 
= compute_local_sum_histogram(ctx, + response, + tree_order, + node_list, + local_sum_hist, + local_sum2cent_hist, + node_count, + deps); + { + ONEDAL_PROFILER_TASK(allreduce_sum_hist, queue_); + comm_.allreduce(local_sum_hist.flatten(queue_, { last_event })).wait(); + } + { + ONEDAL_PROFILER_TASK(allreduce_sum2cent_hist, queue_); + comm_.allreduce(local_sum2cent_hist.flatten(queue_, { last_event })).wait(); + } + + last_event = compute_initial_imp_for_node_list_regression(ctx, + node_list, + local_sum_hist, + local_sum2cent_hist, + imp_data_list, + node_count, + { last_event }); last_event.wait_and_throw(); } @@ -1409,8 +1549,6 @@ sycl::event train_kernel_hist_impl::do_node_split( const Index* node_list_ptr = node_list.get_data(); const Index* node_vs_tree_map_list_ptr = node_vs_tree_map_list.get_data(); - const bool distr_mode = ctx.distr_mode_; - Index* node_list_new_ptr = node_list_new.get_mutable_data(); Index* node_vs_tree_map_list_new_ptr = node_vs_tree_map_list_new.get_mutable_data(); @@ -1449,7 +1587,7 @@ sycl::event train_kernel_hist_impl::do_node_split( Index* node_rch = node_list_new_ptr + (new_left_node_pos + 1) * node_prop_count; node_lch[impl_const_t::ind_ofs] = node_prn[impl_const_t::ind_ofs]; - node_lch[impl_const_t::ind_lrc] = distr_mode + node_lch[impl_const_t::ind_lrc] = ctx.distr_mode_ ? node_prn[impl_const_t::ind_lch_lrc] : node_prn[impl_const_t::ind_lch_grc]; node_lch[impl_const_t::ind_grc] = node_prn[impl_const_t::ind_lch_grc]; @@ -1613,7 +1751,7 @@ sycl::event train_kernel_hist_impl::compute_results( pr::ndarray& oob_per_obs_list, pr::ndarray& var_imp, pr::ndarray& var_imp_variance, - const rng_engine_list_t& engine_arr, + rng_engine_list_t& engine_arr, Index tree_idx_in_block, Index tree_in_block_count, Index built_tree_count, @@ -1660,12 +1798,12 @@ sycl::event train_kernel_hist_impl::compute_results( const Float div1 = Float(1) / Float(built_tree_count + tree_idx_in_block + 1); - pr::rng rn_gen; - for (Index column_idx = 0; column_idx < ctx.column_count_; ++column_idx) { - rn_gen.shuffle(oob_row_count, - permutation_ptr, - engine_arr[built_tree_count + tree_idx_in_block].get_state()); + pr::shuffle(queue_, + oob_row_count, + permutation_ptr, + engine_arr[built_tree_count + tree_idx_in_block], + {}); const Float oob_err_perm = compute_oob_error_perm(ctx, model_manager, data_host, @@ -1858,10 +1996,12 @@ train_result train_kernel_hist_impl::operator()( de::check_mul_overflow((ctx.tree_count_ - 1), skip_num); - pr::engine_collection collection(ctx.tree_count_, desc.get_seed()); - rng_engine_list_t engine_arr = collection([&](std::size_t i, std::size_t& skip) { - skip = i * skip_num; - }); + pr::engine_collection_oneapi collection( + queue_, + ctx.tree_count_, + desc.get_seed()); + + rng_engine_list_t engine_arr = collection.get_engines(); pr::ndarray node_imp_decrease_list; @@ -1882,49 +2022,55 @@ train_result train_kernel_hist_impl::operator()( imp_data_mng_t imp_data_holder(queue_, ctx); // initilizing imp_list and class_hist_list (for classification) imp_data_holder.init_new_level(node_count); - de::check_mul_overflow(node_count, impl_const_t::node_prop_count_); de::check_mul_overflow(node_count, impl_const_t::node_imp_prop_count_); - auto node_vs_tree_map_list_host = pr::ndarray::empty({ node_count }); - auto level_node_list_init_host = - pr::ndarray::empty({ node_count * impl_const_t::node_prop_count_ }); - - auto tree_map = node_vs_tree_map_list_host.get_mutable_data(); - auto node_list_ptr = level_node_list_init_host.get_mutable_data(); - - for (Index node = 
0; node < node_count; ++node) { - Index* node_ptr = node_list_ptr + node * impl_const_t::node_prop_count_; - tree_map[node] = iter + node; - node_ptr[impl_const_t::ind_ofs] = - ctx.selected_row_total_count_ * node; // local row offset - node_ptr[impl_const_t::ind_lrc] = - ctx.distr_mode_ - ? 0 - : ctx.selected_row_count_; // for distr_mode it will be updated during gen_initial_tree_order - node_ptr[impl_const_t::ind_grc] = - ctx.selected_row_total_count_; // global selected rows - it is already filtered for current block - node_ptr[impl_const_t::ind_lch_lrc] = - 0; // for distr_mode it will be updated during tree_order_gen - node_ptr[impl_const_t::ind_fid] = impl_const_t::bad_val_; - } + auto node_vs_tree_map_list = + pr::ndarray::empty(queue_, { node_count }, alloc::device); + auto level_node_list_init = + pr::ndarray::empty(queue_, + { node_count * impl_const_t::node_prop_count_ }, + alloc::device); + + auto tree_map = node_vs_tree_map_list.get_mutable_data(); + auto node_list_ptr = level_node_list_init.get_mutable_data(); + + auto fill_event = queue_.submit([&](sycl::handler& cgh) { + cgh.depends_on({ last_event }); + cgh.parallel_for(sycl::range<1>{ std::size_t(node_count) }, [=](sycl::id<1> node) { + Index* node_ptr = node_list_ptr + node * impl_const_t::node_prop_count_; + tree_map[node] = iter + node; + node_ptr[impl_const_t::ind_ofs] = + ctx.selected_row_total_count_ * node; // local row offset + node_ptr[impl_const_t::ind_lrc] = + ctx.distr_mode_ + ? 0 + : ctx.selected_row_count_; // for distr_mode it will be updated during gen_initial_tree_order + node_ptr[impl_const_t::ind_grc] = + ctx.selected_row_total_count_; // global selected rows - it is already filtered for current block + node_ptr[impl_const_t::ind_lch_lrc] = + 0; // for distr_mode it will be updated during tree_order_gen + node_ptr[impl_const_t::ind_fid] = impl_const_t::bad_val_; + }); + }); - last_event = gen_initial_tree_order(ctx, - engine_arr, - level_node_list_init_host, - tree_order_lev_, - iter, - node_count); - - auto node_vs_tree_map_list = node_vs_tree_map_list_host.to_device(queue_); - level_node_lists.push_back(level_node_list_init_host.to_device(queue_)); - - last_event = compute_initial_histogram(ctx, - response_nd_, - tree_order_lev_, - level_node_lists[0], - imp_data_holder.get_mutable_data(0), - node_count, - { last_event }); + auto gen_initial_tree_order_event = gen_initial_tree_order(ctx, + engine_arr, + level_node_list_init, + tree_order_lev_, + iter, + node_count, + { fill_event }); + + level_node_lists.push_back(level_node_list_init); + + auto compute_initial_histogram_event = + compute_initial_histogram(ctx, + response_nd_, + tree_order_lev_, + level_node_lists[0], + imp_data_holder.get_mutable_data(0), + node_count, + { gen_initial_tree_order_event }); last_event.wait_and_throw(); if (ctx.oob_required_) { @@ -2000,7 +2146,6 @@ train_result train_kernel_hist_impl::operator()( if (node_count_new) { //there are split nodes -> next level is required node_count_new *= 2; - de::check_mul_overflow(node_count_new, impl_const_t::node_prop_count_); auto node_list_new = pr::ndarray::empty( queue_, diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_misc_structs.hpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_misc_structs.hpp index 96e4552ea1b..360c1ba1065 100644 --- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_misc_structs.hpp +++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_misc_structs.hpp @@ -119,7 +119,7 @@ struct train_context { Index selected_ftr_count_ 
= 0; Index selected_row_count_ = 0; - Index selected_row_total_count_ = 0; + std::int64_t selected_row_total_count_ = 0; Index min_observations_in_leaf_node_ = 0; Index max_tree_depth_ = 0; diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_service_kernels_dpc.cpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_service_kernels_dpc.cpp index 11549f3d62d..0de8dd1d6f6 100644 --- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_service_kernels_dpc.cpp +++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_service_kernels_dpc.cpp @@ -19,6 +19,7 @@ #include "oneapi/dal/table/row_accessor.hpp" #include "oneapi/dal/detail/profiler.hpp" #include "oneapi/dal/algo/decision_forest/backend/gpu/train_helpers.hpp" +#include #ifdef ONEDAL_DATA_PARALLEL diff --git a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_splitter_impl_dpc.cpp b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_splitter_impl_dpc.cpp index eeaafe2a179..d7ad6137288 100644 --- a/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_splitter_impl_dpc.cpp +++ b/cpp/oneapi/dal/algo/decision_forest/backend/gpu/train_splitter_impl_dpc.cpp @@ -19,7 +19,7 @@ #include "oneapi/dal/table/row_accessor.hpp" #include "oneapi/dal/detail/profiler.hpp" #include "oneapi/dal/algo/decision_forest/backend/gpu/train_helpers.hpp" - +#include #ifdef ONEDAL_DATA_PARALLEL #include "oneapi/dal/algo/decision_forest/backend/gpu/train_splitter_impl.hpp" @@ -134,7 +134,6 @@ sycl::event train_splitter_impl::random_split( const auto nd_range = bk::make_multiple_nd_range_2d({ local_size, node_in_block_count }, { local_size, 1 }); - sycl::event last_event = queue.submit([&](sycl::handler& cgh) { cgh.depends_on(deps); local_accessor_rw_t local_hist_buf(hist_size, cgh); @@ -537,7 +536,19 @@ sycl::event train_splitter_impl::best_split( const Index bin_block = compute_bin_block_size(queue, hist_prop_count, bin_count); - const Index local_size = bk::device_max_wg_size(queue); + const Index local_size_initial = bk::device_max_wg_size(queue); + Index local_size = local_size_initial; + const auto max_int_limit = std::numeric_limits::max(); + + if (node_count * ftr_count > 0 && node_count * ftr_count <= max_int_limit) { + while (node_count * ftr_count * local_size > max_int_limit) { + local_size /= 2; + } + } + else { + std::cerr << "Error: node_count * ftr_count exceeds int limit" << std::endl; + } + const auto nd_range = bk::make_multiple_nd_range_3d({ node_count, ftr_count, local_size }, { 1, 1, local_size }); @@ -686,7 +697,6 @@ sycl::event train_splitter_impl::best_split( } }); }); - // Merging kernel: selects best split among all features. 
const auto merge_range = bk::make_multiple_nd_range_2d({ node_count, local_size }, { 1, local_size }); diff --git a/cpp/oneapi/dal/algo/decision_forest/test/spmd.cpp b/cpp/oneapi/dal/algo/decision_forest/test/spmd.cpp index 534acddb04a..fe99f8f8d4b 100644 --- a/cpp/oneapi/dal/algo/decision_forest/test/spmd.cpp +++ b/cpp/oneapi/dal/algo/decision_forest/test/spmd.cpp @@ -209,7 +209,7 @@ DF_SPMD_CLS_TEST_NIGHTLY_EXT("df cls default flow") { desc.set_max_tree_depth(max_tree_depth_val); desc.set_class_count(wl.ds_info.class_count); - this->set_rank_count(2); + this->set_rank_count(4); const auto train_result = this->train_spmd_base_checks(desc, data, this->get_homogen_table_id()); const auto model = train_result.get_model(); @@ -233,7 +233,7 @@ DF_SPMD_CLS_TEST_EXT("df cls corner flow") { desc.set_min_observations_in_leaf_node(8); desc.set_class_count(wl.ds_info.class_count); - this->set_rank_count(2); + this->set_rank_count(4); const auto train_result = this->train_spmd_base_checks(desc, data, this->get_homogen_table_id()); const auto model = train_result.get_model(); @@ -257,7 +257,7 @@ DF_SPMD_CLS_TEST_EXT("df cls small flow") { desc.set_tree_count(tree_count); desc.set_class_count(wl.ds_info.class_count); - this->set_rank_count(2); + this->set_rank_count(4); const auto train_result = this->train_spmd_base_checks(desc, data, this->get_homogen_table_id()); const auto model = train_result.get_model(); @@ -288,7 +288,7 @@ DF_SPMD_CLS_TEST_NIGHTLY_EXT("df cls impurity flow") { desc.set_impurity_threshold(impurity_threshold_val); desc.set_class_count(wl.ds_info.class_count); - this->set_rank_count(2); + this->set_rank_count(4); const auto train_result = this->train_spmd_base_checks(desc, data, this->get_homogen_table_id()); const auto model = train_result.get_model(); @@ -317,7 +317,7 @@ DF_SPMD_CLS_TEST_NIGHTLY_EXT("df cls all features flow") { desc.set_features_per_node(data.get_column_count() - 1); // skip responses column desc.set_class_count(wl.ds_info.class_count); - this->set_rank_count(2); + this->set_rank_count(4); const auto train_result = this->train_spmd_base_checks(desc, data, this->get_homogen_table_id()); const auto model = train_result.get_model(); @@ -342,7 +342,7 @@ DF_SPMD_CLS_TEST_NIGHTLY_EXT("df cls bootstrap flow") { desc.set_max_tree_depth(50); desc.set_class_count(wl.ds_info.class_count); - this->set_rank_count(2); + this->set_rank_count(4); const auto train_result = this->train_spmd_base_checks(desc, data, this->get_homogen_table_id()); const auto model = train_result.get_model(); @@ -372,7 +372,7 @@ DF_SPMD_CLS_TEST_NIGHTLY_EXT("df cls oob per observation flow") { desc.set_observations_per_tree_fraction(observations_per_tree_fraction_val); desc.set_class_count(wl.ds_info.class_count); - this->set_rank_count(2); + this->set_rank_count(4); const auto train_result = this->train_spmd_base_checks(desc, data, this->get_homogen_table_id()); const auto model = train_result.get_model(); @@ -393,7 +393,7 @@ DF_SPMD_CLS_TEST("df cls base check with default params") { desc.set_class_count(class_count); - this->set_rank_count(2); + this->set_rank_count(4); const auto train_result = this->train_spmd_base_checks(desc, data, this->get_homogen_table_id()); const auto model = train_result.get_model(); @@ -411,7 +411,7 @@ DF_SPMD_CLS_TEST("df cls base check with default params and train weights") { desc.set_class_count(class_count); - this->set_rank_count(2); + this->set_rank_count(4); const auto train_result = this->train_spmd_weighted_base_checks(desc, data, 
this->get_homogen_table_id()); const auto model = train_result.get_model(); @@ -444,7 +444,7 @@ DF_SPMD_CLS_TEST("df cls base check with non default params") { desc.set_voting_mode(df::voting_mode::unweighted); desc.set_class_count(class_count); - this->set_rank_count(2); + this->set_rank_count(4); const auto train_result = this->train_spmd_base_checks(desc, data, this->get_homogen_table_id()); const auto model = train_result.get_model(); @@ -462,7 +462,7 @@ DF_SPMD_REG_TEST("df reg base check with default params") { auto desc = this->get_default_descriptor(); - this->set_rank_count(2); + this->set_rank_count(4); const auto train_result = this->train_spmd_base_checks(desc, data, this->get_homogen_table_id()); const auto model = train_result.get_model(); @@ -478,7 +478,7 @@ DF_SPMD_REG_TEST("df reg base check with default params and train weights") { auto desc = this->get_default_descriptor(); - this->set_rank_count(2); + this->set_rank_count(4); const auto train_result = this->train_spmd_weighted_base_checks(desc, data, this->get_homogen_table_id()); const auto model = train_result.get_model(); @@ -505,7 +505,7 @@ DF_SPMD_REG_TEST("df reg base check with non default params") { desc.set_variable_importance_mode(variable_importance_mode_val); desc.set_error_metric_mode(error_metric_mode_val); - this->set_rank_count(2); + this->set_rank_count(4); const auto train_result = this->train_spmd_base_checks(desc, data, this->get_homogen_table_id()); const auto model = train_result.get_model(); @@ -524,7 +524,7 @@ DF_SPMD_REG_TEST_NIGHTLY_EXT("df reg default flow") { auto desc = this->get_default_descriptor(); - this->set_rank_count(2); + this->set_rank_count(4); const auto train_result = this->train_spmd_base_checks(desc, data, this->get_homogen_table_id()); const auto model = train_result.get_model(); @@ -547,7 +547,7 @@ DF_SPMD_REG_TEST_EXT("df reg small flow") { desc.set_tree_count(tree_count); desc.set_min_observations_in_leaf_node(1); - this->set_rank_count(2); + this->set_rank_count(4); const auto train_result = this->train_spmd_base_checks(desc, data, this->get_homogen_table_id()); const auto model = train_result.get_model(); @@ -572,7 +572,7 @@ DF_SPMD_REG_TEST_NIGHTLY_EXT("df reg impurity flow") { desc.set_min_observations_in_leaf_node(min_observations_in_leaf_node); desc.set_impurity_threshold(impurity_threshold_val); - this->set_rank_count(2); + this->set_rank_count(4); const auto train_result = this->train_spmd_base_checks(desc, data, this->get_homogen_table_id()); const auto model = train_result.get_model(); @@ -599,7 +599,7 @@ DF_SPMD_REG_TEST_NIGHTLY_EXT("df reg bootstrap flow") { desc.set_max_tree_depth(max_tree_depth_val); desc.set_bootstrap(bootstrap_val); - this->set_rank_count(2); + this->set_rank_count(4); const auto train_result = this->train_spmd_base_checks(desc, data, this->get_homogen_table_id()); const auto model = train_result.get_model(); diff --git a/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp b/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp index d21de8c9627..ecd49784378 100644 --- a/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp +++ b/cpp/oneapi/dal/algo/louvain/backend/cpu/louvain_data.hpp @@ -17,7 +17,7 @@ #pragma once #include "oneapi/dal/backend/memory.hpp" -#include "oneapi/dal/backend/primitives/rng/rng_engine.hpp" +#include "oneapi/dal/backend/primitives/rng/rng.hpp" namespace oneapi::dal::preview::louvain::backend { using namespace oneapi::dal::preview::detail; @@ -123,8 +123,7 @@ struct louvain_data { // Total link weight in 
the network value_type m; - engine eng; - rng rn_gen; + host_engine eng; const std::int64_t vertex_count; const std::int64_t edge_count; diff --git a/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp b/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp index 79e294e9f47..e287c3f2f66 100644 --- a/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp +++ b/cpp/oneapi/dal/algo/louvain/backend/cpu/vertex_partitioning_default_kernel.hpp @@ -206,7 +206,7 @@ inline Float move_nodes(const dal::preview::detail::topology& t, ld.random_order[index] = index; } // random shuffle - ld.rn_gen.uniform(t._vertex_count, ld.index, ld.eng.get_state(), 0, t._vertex_count); + uniform(t._vertex_count, ld.index, ld.eng, 0, t._vertex_count); for (std::int64_t index = 0; index < t._vertex_count; ++index) { std::swap(ld.random_order[index], ld.random_order[ld.index[index]]); } diff --git a/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp b/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp index d22a2dde0a1..6cf2b73ccd6 100644 --- a/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp +++ b/cpp/oneapi/dal/backend/primitives/objective_function/test/fixture.hpp @@ -25,7 +25,7 @@ #include "oneapi/dal/table/csr_accessor.hpp" #include "oneapi/dal/detail/debug.hpp" -#include "oneapi/dal/backend/primitives/rng/rng_engine.hpp" +#include "oneapi/dal/backend/primitives/rng/rng.hpp" namespace oneapi::dal::backend::primitives::test { @@ -572,13 +572,12 @@ class logloss_test : public te::float_algo_fixture rn_gen; auto vec_host = ndarray::empty(this->get_queue(), { dim }, sycl::usm::alloc::host); for (std::int32_t ij = 0; ij < num_checks; ++ij) { - primitives::engine eng(2007 + dim * num_checks + ij); - rn_gen.uniform(dim, vec_host.get_mutable_data(), eng.get_state(), -1.0, 1.0); + primitives::host_engine eng(2007 + dim * num_checks + ij); + pr::uniform(dim, vec_host.get_mutable_data(), eng, -1.0, 1.0); auto vec_gpu = vec_host.to_device(this->get_queue()); auto out_vector = ndarray::empty(this->get_queue(), { dim }, sycl::usm::alloc::device); diff --git a/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp b/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp index e902dd452e1..63ab0a07c13 100644 --- a/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp +++ b/cpp/oneapi/dal/backend/primitives/objective_function/test/spmd_fixture.hpp @@ -100,12 +100,12 @@ class logloss_spmd_test : public logloss_test { std::int64_t num_checks = 5; std::vector> vecs_host(num_checks), vecs_gpu(num_checks); - rng rn_gen; + for (std::int64_t ij = 0; ij < num_checks; ++ij) { - engine eng(2007 + dim * num_checks + ij); + host_engine eng(2007 + dim * num_checks + ij); vecs_host[ij] = (ndarray::empty(this->get_queue(), { dim }, sycl::usm::alloc::host)); - rn_gen.uniform(dim, vecs_host[ij].get_mutable_data(), eng.get_state(), -1.0, 1.0); + uniform(dim, vecs_host[ij].get_mutable_data(), eng, -1.0, 1.0); vecs_gpu[ij] = vecs_host[ij].to_device(this->get_queue()); } diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp index ea320f690a2..b529836f70e 100644 --- a/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/cg_solver_dpc.cpp @@ -20,7 +20,7 @@ #include 
"oneapi/dal/test/engine/common.hpp" #include "oneapi/dal/test/engine/fixtures.hpp" #include "oneapi/dal/table/row_accessor.hpp" -#include "oneapi/dal/backend/primitives/rng/rng_engine.hpp" +#include "oneapi/dal/backend/primitives/rng/rng.hpp" #include namespace oneapi::dal::backend::primitives::test { @@ -43,9 +43,8 @@ class cg_solver_test : public te::float_algo_fixture { x_host_ = ndarray::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host); b_host_ = ndarray::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host); - primitives::rng rn_gen; - primitives::engine eng(4014 + n_); - rn_gen.uniform(n_, x_host_.get_mutable_data(), eng.get_state(), -1.0, 1.0); + primitives::host_engine eng(4014 + n_); + primitives::uniform(n_, x_host_.get_mutable_data(), eng, -1.0, 1.0); create_stable_matrix(this->get_queue(), A_host_); diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp index a6b87b2dcc1..c188c50983c 100644 --- a/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp +++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/fixture.hpp @@ -21,7 +21,7 @@ #include "oneapi/dal/backend/primitives/ndarray.hpp" #include "oneapi/dal/test/engine/common.hpp" #include "oneapi/dal/test/engine/fixtures.hpp" -#include "oneapi/dal/backend/primitives/rng/rng_engine.hpp" +#include "oneapi/dal/backend/primitives/rng/rng.hpp" #include "oneapi/dal/backend/primitives/blas/gemv.hpp" #include "oneapi/dal/backend/primitives/element_wise.hpp" @@ -133,11 +133,10 @@ void create_stable_matrix(sycl::queue& queue, ONEDAL_ASSERT(A.get_dimension(1) == n); auto J = ndarray::empty(queue, { n, n }, sycl::usm::alloc::host); auto eigen_values = ndarray::empty(queue, { n }, sycl::usm::alloc::host); - primitives::rng rn_gen; - primitives::engine eng(2007 + n); + primitives::host_engine eng(2007 + n); - rn_gen.uniform(n * n, J.get_mutable_data(), eng.get_state(), -1.0, 1.0); - rn_gen.uniform(n, eigen_values.get_mutable_data(), eng.get_state(), bottom_eig, top_eig); + primitives::uniform(n * n, J.get_mutable_data(), eng, -1.0, 1.0); + primitives::uniform(n, eigen_values.get_mutable_data(), eng, bottom_eig, top_eig); // orthogonalize matrix J gram_schmidt(J); diff --git a/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp b/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp index f473dddf1f7..b2ebe9f5bdb 100644 --- a/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp +++ b/cpp/oneapi/dal/backend/primitives/optimizers/test/newton_cg_dpc.cpp @@ -22,7 +22,7 @@ #include "oneapi/dal/test/engine/common.hpp" #include "oneapi/dal/test/engine/fixtures.hpp" #include "oneapi/dal/table/row_accessor.hpp" -#include "oneapi/dal/backend/primitives/rng/rng_engine.hpp" +#include "oneapi/dal/backend/primitives/rng/rng.hpp" #include #include "oneapi/dal/backend/primitives/objective_function.hpp" @@ -56,10 +56,10 @@ class newton_cg_test : public te::float_algo_fixture { ndarray::empty(this->get_queue(), { n_ + 1 }, sycl::usm::alloc::host); auto params_host = ndarray::empty(this->get_queue(), { p_ + 1 }, sycl::usm::alloc::host); - primitives::rng rn_gen; - primitives::engine eng(2007 + n); - rn_gen.uniform(n_ * p_, X_host.get_mutable_data(), eng.get_state(), -10.0, 10.0); - rn_gen.uniform(p_ + 1, params_host.get_mutable_data(), eng.get_state(), -5.0, 5.0); + + primitives::host_engine eng(2007 + n); + primitives::uniform(n_ * p_, X_host.get_mutable_data(), eng, -10.0, 10.0); + primitives::uniform(p_ + 1, 
params_host.get_mutable_data(), eng, -5.0, 5.0); for (std::int64_t i = 0; i < n_; ++i) { float_t val = 0; for (std::int64_t j = 0; j < p_; ++j) { @@ -142,9 +142,9 @@ class newton_cg_test : public te::float_algo_fixture { ndarray::empty(this->get_queue(), { n_, n_ }, sycl::usm::alloc::host); solution_ = ndarray::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host); auto b_host = ndarray::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host); - primitives::rng rn_gen; + primitives::engine eng(4014 + n_); - rn_gen.uniform(n_, solution_.get_mutable_data(), eng.get_state(), -1.0, 1.0); + uniform(n_, solution_.get_mutable_data(), eng, -1.0, 1.0); create_stable_matrix(this->get_queue(), A_host, float_t(0.1), float_t(5.0)); @@ -164,7 +164,7 @@ class newton_cg_test : public te::float_algo_fixture { auto buffer = ndarray::empty(this->get_queue(), { n_ }, sycl::usm::alloc::host); for (std::int32_t test_num = 0; test_num < 5; ++test_num) { - rn_gen.uniform(n_, x_host.get_mutable_data(), eng.get_state(), -1.0, 1.0); + uniform(n_, x_host.get_mutable_data(), eng, -1.0, 1.0); auto x_gpu = x_host.to_device(this->get_queue()); auto compute_event_vec = func_->update_x(x_gpu, true, {}); wait_or_pass(compute_event_vec).wait_and_throw(); diff --git a/cpp/oneapi/dal/backend/primitives/rng/dpc_engine.hpp b/cpp/oneapi/dal/backend/primitives/rng/dpc_engine.hpp new file mode 100644 index 00000000000..9b9745f4cfa --- /dev/null +++ b/cpp/oneapi/dal/backend/primitives/rng/dpc_engine.hpp @@ -0,0 +1,153 @@ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#pragma once + +#include "oneapi/dal/backend/primitives/rng/utils.hpp" +#include "oneapi/dal/backend/primitives/rng/rng_types.hpp" +#include + +namespace mkl = oneapi::mkl; +namespace oneapi::dal::backend::primitives { + +#ifdef ONEDAL_DATA_PARALLEL + +template +struct dpc_engine_type; + +template <> +struct dpc_engine_type { + using type = oneapi::mkl::rng::mt2203; +}; + +template <> +struct dpc_engine_type { + using type = oneapi::mkl::rng::mcg59; +}; + +template <> +struct dpc_engine_type { + using type = oneapi::mkl::rng::mt19937; +}; + +template <> +struct dpc_engine_type { + using type = oneapi::mkl::rng::mrg32k3a; +}; + +template <> +struct dpc_engine_type { + using type = oneapi::mkl::rng::philox4x32x10; +}; + +/// A class that provides a unified interface for random number generation on both CPU and GPU devices. +/// +/// This class serves as a wrapper for random number generators (RNGs) that supports different engine types, +/// enabling efficient random number generation on heterogeneous platforms using SYCL. It integrates a host +/// (CPU) engine and a device (GPU) engine, allowing operations to be executed seamlessly on the appropriate +/// device. +/// +/// @tparam EngineType The RNG engine type to be used. Defaults to `engine_method::mt2203`. 
+/// +/// @param[in] queue The SYCL queue used to manage device operations. +/// @param[in] seed The initial seed for the random number generator. Defaults to `777`. +/// +/// The class provides functionality to skip ahead in the RNG sequence, retrieve engine states, and +/// manage host and device engines independently. Support for `skip_ahead` on GPU is currently limited for +/// some engine types. +template +class dpc_engine { +public: + using dpc_engine_t = typename dpc_engine_type::type; + + explicit dpc_engine(sycl::queue& queue, std::int64_t seed = 777) + : q(queue), + host_engine_(initialize_host_engine(seed)), + dpc_engine_(initialize_dpc_engine(queue, seed)), + impl_(dynamic_cast( + host_engine_.get())) { + if (!impl_) { + throw std::domain_error("RNG engine is not supported"); + } + } + + virtual ~dpc_engine() = default; + + void* get_host_engine_state() const { + return impl_->getState(); + } + + auto& get_cpu_engine() { + return host_engine_; + } + + auto& get_gpu_engine() { + return dpc_engine_; + } + + void skip_ahead_cpu(size_t nSkip) { + host_engine_->skipAhead(nSkip); + } + + void skip_ahead_gpu(size_t nSkip) { + // Will be supported in the next oneMKL release. + if constexpr (EngineType == engine_method::mt2203) { + } + else { + skip_ahead(dpc_engine_, nSkip); + } + } + + sycl::queue& get_queue() { + return q; + } + +private: + daal::algorithms::engines::EnginePtr initialize_host_engine(std::int64_t seed) { + switch (EngineType) { + case engine_method::mt2203: + return daal::algorithms::engines::mt2203::Batch<>::create(seed); + case engine_method::mcg59: + return daal::algorithms::engines::mcg59::Batch<>::create(seed); + case engine_method::mrg32k3a: + return daal::algorithms::engines::mrg32k3a::Batch<>::create(seed); + case engine_method::philox4x32x10: + return daal::algorithms::engines::philox4x32x10::Batch<>::create(seed); + case engine_method::mt19937: + return daal::algorithms::engines::mt19937::Batch<>::create(seed); + default: throw std::invalid_argument("Unsupported engine type"); + } + } + + dpc_engine_t initialize_dpc_engine(sycl::queue& queue, std::int64_t seed) { + if constexpr (EngineType == engine_method::mt2203) { + return dpc_engine_t( + queue, + seed, + 0); // Aligns CPU and GPU results for mt2203, impacts the performance. + } + else { + return dpc_engine_t(queue, seed); + } + } + sycl::queue q; + daal::algorithms::engines::EnginePtr host_engine_; + dpc_engine_t dpc_engine_; + daal::algorithms::engines::internal::BatchBaseImpl* impl_; +}; + +#endif +} // namespace oneapi::dal::backend::primitives diff --git a/cpp/oneapi/dal/backend/primitives/rng/host_engine.hpp b/cpp/oneapi/dal/backend/primitives/rng/host_engine.hpp new file mode 100644 index 00000000000..c4b2c807674 --- /dev/null +++ b/cpp/oneapi/dal/backend/primitives/rng/host_engine.hpp @@ -0,0 +1,100 @@ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+*******************************************************************************/ + +#pragma once + +#include "oneapi/dal/backend/primitives/rng/rng_types.hpp" +#include "oneapi/dal/backend/primitives/rng/rng.hpp" +#include "oneapi/dal/backend/primitives/rng/utils.hpp" + +#include +#include +#include + +namespace oneapi::dal::backend::primitives { + +/// A class that provides an interface for random number generation on the host (CPU) only. +/// +/// This class serves as a wrapper for host-based random number generators (RNGs), supporting multiple engine +/// types for flexible and efficient random number generation on CPU. It abstracts the underlying engine +/// implementation and provides an interface to manage and retrieve the engine's state. +/// +/// @tparam EngineType The RNG engine type to be used. Defaults to `engine_method::mt2203`. +/// +/// @param[in] seed The initial seed for the random number generator. Defaults to `777`. +/// +/// @note The class only supports host-based RNG and does not require a SYCL queue or device context. +template +class host_engine { +public: + explicit host_engine(std::int64_t seed = 777) + : host_engine_(initialize_host_engine(seed)), + impl_(dynamic_cast( + host_engine_.get())) { + if (!impl_) { + throw std::domain_error("RNG engine is not supported"); + } + } + + explicit host_engine(const daal::algorithms::engines::EnginePtr& eng) : host_engine_(eng) { + impl_ = dynamic_cast(eng.get()); + if (!impl_) { + throw domain_error(dal::detail::error_messages::rng_engine_is_not_supported()); + } + } + + host_engine& operator=(const daal::algorithms::engines::EnginePtr& eng) { + host_engine_ = eng; + impl_ = dynamic_cast(eng.get()); + if (!impl_) { + throw domain_error(dal::detail::error_messages::rng_engine_is_not_supported()); + } + + return *this; + } + + virtual ~host_engine() = default; + + void* get_host_engine_state() const { + return impl_->getState(); + } + + auto& get_host_engine() { + return host_engine_; + } + +private: + daal::algorithms::engines::EnginePtr initialize_host_engine(std::int64_t seed) { + switch (EngineType) { + case engine_method::mt2203: + return daal::algorithms::engines::mt2203::Batch<>::create(seed); + case engine_method::mcg59: + return daal::algorithms::engines::mcg59::Batch<>::create(seed); + case engine_method::mrg32k3a: + return daal::algorithms::engines::mrg32k3a::Batch<>::create(seed); + case engine_method::philox4x32x10: + return daal::algorithms::engines::philox4x32x10::Batch<>::create(seed); + case engine_method::mt19937: + return daal::algorithms::engines::mt19937::Batch<>::create(seed); + default: throw std::invalid_argument("Unsupported engine type"); + } + } + + daal::algorithms::engines::EnginePtr host_engine_; + daal::algorithms::engines::internal::BatchBaseImpl* impl_; +}; + +} // namespace oneapi::dal::backend::primitives diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng.hpp new file mode 100644 index 00000000000..83125ba73e7 --- /dev/null +++ b/cpp/oneapi/dal/backend/primitives/rng/rng.hpp @@ -0,0 +1,134 @@ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#pragma once + +#include "oneapi/dal/backend/primitives/rng/host_engine.hpp" + +#ifdef ONEDAL_DATA_PARALLEL + +#include "oneapi/dal/backend/primitives/rng/dpc_engine.hpp" + +#endif + +namespace oneapi::dal::backend::primitives { + +template +void uniform(Size count, Type* dst, host_engine& host_engine, Type a, Type b) { + auto state = host_engine.get_host_engine_state(); + uniform_dispatcher::uniform_by_cpu(count, dst, state, a, b); +} + +template +void uniform_without_replacement(Size count, + Type* dst, + Type* buffer, + host_engine host_engine, + Type a, + Type b) { + auto state = host_engine.get_host_engine_state(); + uniform_dispatcher::uniform_without_replacement_by_cpu(count, dst, buffer, state, a, b); +} + +template >> +void shuffle(Size count, Type* dst, host_engine host_engine) { + auto state = host_engine.get_host_engine_state(); + Type idx[2]; + for (Size i = 0; i < count; ++i) { + uniform_dispatcher::uniform_by_cpu(2, idx, state, 0, count); + std::swap(dst[idx[0]], dst[idx[1]]); + } +} + +#ifdef ONEDAL_DATA_PARALLEL +template +void uniform(Size count, Type* dst, dpc_engine& engine_, Type a, Type b) { + if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) == + sycl::usm::alloc::device) { + throw domain_error(dal::detail::error_messages::unsupported_data_type()); + } + auto state = engine_.get_host_engine_state(); + uniform_dispatcher::uniform_by_cpu(count, dst, state, a, b); + engine_.skip_ahead_gpu(count); +} + +template +void uniform_without_replacement(Size count, + Type* dst, + Type* buffer, + dpc_engine& engine_, + Type a, + Type b) { + if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) == + sycl::usm::alloc::device) { + throw domain_error(dal::detail::error_messages::unsupported_data_type()); + } + void* state = engine_.get_host_engine_state(); + uniform_dispatcher::uniform_without_replacement_by_cpu(count, dst, buffer, state, a, b); + engine_.skip_ahead_gpu(count); +} + +template >> +void shuffle(Size count, Type* dst, dpc_engine& engine_) { + if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) == + sycl::usm::alloc::device) { + throw domain_error(dal::detail::error_messages::unsupported_data_type()); + } + Type idx[2]; + void* state = engine_.get_host_engine_state(); + for (Size i = 0; i < count; ++i) { + uniform_dispatcher::uniform_by_cpu(2, idx, state, 0, count); + std::swap(dst[idx[0]], dst[idx[1]]); + } + engine_.skip_ahead_gpu(count); +} + +template +void uniform(sycl::queue& queue, + Size count, + Type* dst, + dpc_engine& engine_, + Type a, + Type b, + const event_vector& deps = {}); + +template +void uniform_without_replacement(sycl::queue& queue, + Size count, + Type* dst, + Type* buffer, + dpc_engine& engine_, + Type a, + Type b, + const event_vector& deps = {}); + +template +void shuffle(sycl::queue& queue, + Size count, + Type* dst, + dpc_engine& engine_, + const event_vector& deps = {}); +#endif + +}; // namespace oneapi::dal::backend::primitives diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp 
b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp new file mode 100644 index 00000000000..4ad09c4cc99 --- /dev/null +++ b/cpp/oneapi/dal/backend/primitives/rng/rng_dpc.cpp @@ -0,0 +1,186 @@ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include +#include "oneapi/dal/backend/primitives/rng/rng.hpp" +#include "oneapi/dal/backend/primitives/ndarray.hpp" + +namespace oneapi::dal::backend::primitives { + +namespace bk = oneapi::dal::backend; + +template +void uniform(sycl::queue& queue, + Size count, + Type* dst, + dpc_engine& engine_, + Type a, + Type b, + const event_vector& deps) { + if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) == sycl::usm::alloc::host) { + throw domain_error(dal::detail::error_messages::unsupported_data_type()); + } + oneapi::mkl::rng::uniform distr(a, b); + auto event = oneapi::mkl::rng::generate(distr, engine_.get_gpu_engine(), count, dst, { deps }); + event.wait_and_throw(); + engine_.skip_ahead_cpu(count); +} + +//Currently only CPU impl +template +void uniform_without_replacement(sycl::queue& queue, + Size count, + Type* dst, + Type* buffer, + dpc_engine& engine_, + Type a, + Type b, + const event_vector& deps) { + if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) == + sycl::usm::alloc::device) { + throw domain_error(dal::detail::error_messages::unsupported_data_type()); + } + void* state = engine_.get_host_engine_state(); + engine_.skip_ahead_gpu(count); + uniform_dispatcher::uniform_without_replacement_by_cpu(count, dst, buffer, state, a, b); +} + +//Currently only CPU impl +template +void shuffle(sycl::queue& queue, + Size count, + Type* dst, + dpc_engine& engine_, + const event_vector& deps) { + Type idx[2]; + if (sycl::get_pointer_type(dst, engine_.get_queue().get_context()) == + sycl::usm::alloc::device) { + throw domain_error(dal::detail::error_messages::unsupported_data_type()); + } + void* state = engine_.get_host_engine_state(); + engine_.skip_ahead_gpu(count); + + for (Size i = 0; i < count; ++i) { + uniform_dispatcher::uniform_by_cpu(2, idx, state, 0, count); + std::swap(dst[idx[0]], dst[idx[1]]); + } +} + +#define INSTANTIATE_(F, Size, EngineType) \ + template ONEDAL_EXPORT void uniform(sycl::queue& queue, \ + Size count_, \ + F* dst, \ + dpc_engine& engine_, \ + F a, \ + F b, \ + const event_vector& deps); + +#define INSTANTIATE_FLOAT_(Size) \ + INSTANTIATE_(float, Size, engine_method::mt2203) \ + INSTANTIATE_(float, Size, engine_method::mcg59) \ + INSTANTIATE_(float, Size, engine_method::mrg32k3a) \ + INSTANTIATE_(float, Size, engine_method::philox4x32x10) \ + INSTANTIATE_(float, Size, engine_method::mt19937) \ + INSTANTIATE_(double, Size, engine_method::mt2203) \ + INSTANTIATE_(double, Size, engine_method::mcg59) \ + INSTANTIATE_(double, Size, engine_method::mrg32k3a) \ + INSTANTIATE_(double, Size, 
engine_method::philox4x32x10) \ + INSTANTIATE_(double, Size, engine_method::mt19937) \ + INSTANTIATE_(std::int32_t, Size, engine_method::mt2203) \ + INSTANTIATE_(std::int32_t, Size, engine_method::mcg59) \ + INSTANTIATE_(std::int32_t, Size, engine_method::mrg32k3a) \ + INSTANTIATE_(std::int32_t, Size, engine_method::philox4x32x10) \ + INSTANTIATE_(std::int32_t, Size, engine_method::mt19937) +INSTANTIATE_FLOAT_(std::int64_t); +INSTANTIATE_FLOAT_(std::int32_t); + +#define INSTANTIATE_uniform_without_replacement(F, Size, EngineType) \ + template ONEDAL_EXPORT void uniform_without_replacement(sycl::queue& queue, \ + Size count_, \ + F* dst, \ + F* buff, \ + dpc_engine& engine_, \ + F a, \ + F b, \ + const event_vector& deps); + +#define INSTANTIATE_uniform_without_replacement_FLOAT(Size) \ + INSTANTIATE_uniform_without_replacement(float, Size, engine_method::mt2203) \ + INSTANTIATE_uniform_without_replacement( \ + float, \ + Size, \ + engine_method::mcg59) INSTANTIATE_uniform_without_replacement(float, \ + Size, \ + engine_method::mrg32k3a) \ + INSTANTIATE_uniform_without_replacement(float, Size, engine_method::philox4x32x10) \ + INSTANTIATE_uniform_without_replacement(float, Size, engine_method::mt19937) \ + INSTANTIATE_uniform_without_replacement(double, Size, engine_method::mt2203) \ + INSTANTIATE_uniform_without_replacement(double, \ + Size, \ + engine_method::mcg59) \ + INSTANTIATE_uniform_without_replacement(double, \ + Size, \ + engine_method::mrg32k3a) \ + INSTANTIATE_uniform_without_replacement( \ + double, \ + Size, \ + engine_method::philox4x32x10) \ + INSTANTIATE_uniform_without_replacement( \ + double, \ + Size, \ + engine_method::mt19937) \ + INSTANTIATE_uniform_without_replacement( \ + std::int32_t, \ + Size, \ + engine_method::mt2203) \ + INSTANTIATE_uniform_without_replacement( \ + std::int32_t, \ + Size, \ + engine_method::mcg59) \ + INSTANTIATE_uniform_without_replacement( \ + std::int32_t, \ + Size, \ + engine_method::mrg32k3a) \ + INSTANTIATE_uniform_without_replacement( \ + std::int32_t, \ + Size, \ + engine_method::philox4x32x10) \ + INSTANTIATE_uniform_without_replacement( \ + std::int32_t, \ + Size, \ + engine_method::mt19937) + +INSTANTIATE_uniform_without_replacement_FLOAT(std::int64_t); +INSTANTIATE_uniform_without_replacement_FLOAT(std::int32_t); + +#define INSTANTIATE_SHUFFLE(F, Size, EngineType) \ + template ONEDAL_EXPORT void shuffle(sycl::queue& queue, \ + Size count_, \ + F* dst, \ + dpc_engine& engine_, \ + const event_vector& deps); + +#define INSTANTIATE_SHUFFLE_FLOAT(Size) \ + INSTANTIATE_SHUFFLE(std::int32_t, Size, engine_method::mt2203) \ + INSTANTIATE_SHUFFLE(std::int32_t, Size, engine_method::mcg59) \ + INSTANTIATE_SHUFFLE(std::int32_t, Size, engine_method::mrg32k3a) \ + INSTANTIATE_SHUFFLE(std::int32_t, Size, engine_method::philox4x32x10) \ + INSTANTIATE_SHUFFLE(std::int32_t, Size, engine_method::mt19937) + +INSTANTIATE_SHUFFLE_FLOAT(std::int64_t); +INSTANTIATE_SHUFFLE_FLOAT(std::int32_t); + +} // namespace oneapi::dal::backend::primitives diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_engine.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_engine.hpp deleted file mode 100644 index c8ca3b13ce9..00000000000 --- a/cpp/oneapi/dal/backend/primitives/rng/rng_engine.hpp +++ /dev/null @@ -1,101 +0,0 @@ -/******************************************************************************* -* Copyright 2021 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the 
License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*******************************************************************************/ - -#pragma once - -#include - -#include "oneapi/dal/backend/primitives/rng/utils.hpp" - -namespace oneapi::dal::backend::primitives { - -template -class rng { -public: - rng() = default; - ~rng() = default; - - void uniform(Size count, Type* dst, void* state, Type a, Type b) { - uniform_dispatcher::uniform_by_cpu(count, dst, state, a, b); - } - - void uniform_without_replacement(Size count, - Type* dst, - Type* buffer, - void* state, - Type a, - Type b) { - uniform_dispatcher::uniform_without_replacement_by_cpu(count, - dst, - buffer, - state, - a, - b); - } - - template >> - void shuffle(Size count, Type* dst, void* state) { - Type idx[2]; - - for (Size i = 0; i < count; ++i) { - uniform_dispatcher::uniform_by_cpu(2, idx, state, 0, count); - std::swap(dst[idx[0]], dst[idx[1]]); - } - } - -private: - daal::internal::RNGsInst daal_rng_; -}; - -class engine { -public: - explicit engine(std::int64_t seed = 777) - : engine_(daal::algorithms::engines::mt2203::Batch<>::create(seed)) { - impl_ = dynamic_cast(engine_.get()); - if (!impl_) { - throw domain_error(dal::detail::error_messages::rng_engine_is_not_supported()); - } - } - - explicit engine(const daal::algorithms::engines::EnginePtr& eng) : engine_(eng) { - impl_ = dynamic_cast(eng.get()); - if (!impl_) { - throw domain_error(dal::detail::error_messages::rng_engine_is_not_supported()); - } - } - - virtual ~engine() = default; - - engine& operator=(const daal::algorithms::engines::EnginePtr& eng) { - engine_ = eng; - impl_ = dynamic_cast(eng.get()); - if (!impl_) { - throw domain_error(dal::detail::error_messages::rng_engine_is_not_supported()); - } - - return *this; - } - - void* get_state() const { - return impl_->getState(); - } - -private: - daal::algorithms::engines::EnginePtr engine_; - daal::algorithms::engines::internal::BatchBaseImpl* impl_; -}; - -} // namespace oneapi::dal::backend::primitives diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp index 09a5a589141..e7e19f64c4d 100644 --- a/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp +++ b/cpp/oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp @@ -16,12 +16,18 @@ #pragma once -#include "oneapi/dal/backend/primitives/rng/rng_engine.hpp" - +#include "oneapi/dal/backend/primitives/rng/rng.hpp" +#include "oneapi/dal/backend/primitives/ndarray.hpp" #include +#include "oneapi/dal/backend/primitives/rng/utils.hpp" +#include "oneapi/dal/backend/primitives/rng/rng_types.hpp" +#include "oneapi/dal/table/common.hpp" + namespace oneapi::dal::backend::primitives { +#ifdef ONEDAL_DATA_PARALLEL + template class engine_collection { public: @@ -30,10 +36,10 @@ class engine_collection { engine_(daal::algorithms::engines::mt2203::Batch<>::create(seed)), params_(count), technique_(daal::algorithms::engines::internal::family), - daal_engine_list_(count) {} + host_engine_method_(count) {} template - std::vector operator()(Op&& op) { + std::vector> operator()(Op&& op) { 
daal::services::Status status; for (Size i = 0; i < count_; ++i) { op(i, params_.nSkip[i]); @@ -43,25 +49,25 @@ class engine_collection { engine_, technique_, params_, - daal_engine_list_, + host_engine_method_, &status); if (!status) { dal::backend::interop::status_to_exception(status); } - std::vector engine_list(count_); + std::vector> engine_method(count_); for (Size i = 0; i < count_; ++i) { - engine_list[i] = daal_engine_list_[i]; + engine_method[i] = host_engine_method_[i]; } //copy elision - return engine_list; + return engine_method; } private: void select_parallelization_technique( daal::algorithms::engines::internal::ParallelizationTechnique& technique) { - auto daal_engine_impl = + auto host_engine_impl = dynamic_cast(engine_.get()); daal::algorithms::engines::internal::ParallelizationTechnique techniques[] = { @@ -71,7 +77,7 @@ class engine_collection { }; for (auto& techn : techniques) { - if (daal_engine_impl->hasSupport(techn)) { + if (host_engine_impl->hasSupport(techn)) { technique = techn; return; } @@ -87,7 +93,30 @@ class engine_collection { daal::algorithms::engines::internal::Params params_; daal::algorithms::engines::internal::ParallelizationTechnique technique_; daal::services::internal::TArray - daal_engine_list_; + host_engine_method_; +}; + +template +class engine_collection_oneapi { +public: + engine_collection_oneapi(sycl::queue& queue, Size count, std::int64_t seed = 777) + : count_(count), + seed_(seed) { + engines_.reserve(count_); + for (Size i = 0; i < count_; ++i) { + engines_.push_back(dpc_engine(queue, seed_)); + } + } + + std::vector> get_engines() const { + return engines_; + } + +private: + Size count_; + std::int64_t seed_; + std::vector> engines_; }; +#endif } // namespace oneapi::dal::backend::primitives diff --git a/cpp/oneapi/dal/backend/primitives/rng/rng_types.hpp b/cpp/oneapi/dal/backend/primitives/rng/rng_types.hpp new file mode 100644 index 00000000000..4132fbe557a --- /dev/null +++ b/cpp/oneapi/dal/backend/primitives/rng/rng_types.hpp @@ -0,0 +1,29 @@ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#pragma once + +#include +#include +#include +#include +#include + +namespace oneapi::dal::backend::primitives { + +enum class engine_method { mt2203, mcg59, mt19937, mrg32k3a, philox4x32x10 }; + +} diff --git a/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp new file mode 100644 index 00000000000..2a079f15466 --- /dev/null +++ b/cpp/oneapi/dal/backend/primitives/rng/test/rng_dpc.cpp @@ -0,0 +1,237 @@ +/******************************************************************************* +* Copyright contributors to the oneDAL project +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*******************************************************************************/ + +#include "oneapi/dal/test/engine/common.hpp" +#include "oneapi/dal/test/engine/fixtures.hpp" +#include "oneapi/dal/test/engine/dataframe.hpp" + +#include "oneapi/dal/backend/primitives/rng/rng.hpp" +#include "oneapi/dal/backend/primitives/rng/rng_engine_collection.hpp" +namespace oneapi::dal::backend::primitives::test { + +namespace te = dal::test::engine; + +class mt2203 {}; +class mcg59 {}; +class mrg32k3a {}; +class mt19937 {}; +class philox4x32x10 {}; + +template +struct engine_map {}; + +template <> +struct engine_map { + constexpr static auto value = engine_method::mt2203; +}; + +template <> +struct engine_map { + constexpr static auto value = engine_method::mcg59; +}; + +template <> +struct engine_map { + constexpr static auto value = engine_method::mrg32k3a; +}; + +template <> +struct engine_map { + constexpr static auto value = engine_method::philox4x32x10; +}; + +template <> +struct engine_map { + constexpr static auto value = engine_method::mt19937; +}; + +template +constexpr auto engine_v = engine_map::value; + +template +class rng_test : public te::policy_fixture { +public: + using DataType = std::tuple_element_t<0, TestType>; + using EngineType = std::tuple_element_t<1, TestType>; + static constexpr auto engine_test_type = engine_v; + + auto get_host_engine(std::int64_t seed) { + auto rng_engine = host_engine(seed); + return rng_engine; + } + + auto get_dpc_engine(std::int64_t seed) { + auto rng_engine = dpc_engine(this->get_queue(), seed); + return rng_engine; + } + + auto allocate_array_host(std::int64_t elem_count) { + auto arr_host = ndarray::empty({ elem_count }); + return arr_host; + } + + auto allocate_array_device(std::int64_t elem_count) { + auto& q = this->get_queue(); + auto arr_gpu = ndarray::empty(q, { elem_count }, sycl::usm::alloc::device); + return arr_gpu; + } + + void check_results(const ndarray& arr_1, const ndarray& arr_2) { + const auto arr_1_host = arr_1.to_host(this->get_queue()); + const DataType* val_arr_1_host_ptr = arr_1_host.get_data(); + + const auto arr_2_host = arr_2.to_host(this->get_queue()); + const DataType* val_arr_2_host_ptr = arr_2_host.get_data(); + + for (std::int64_t el = 0; el < arr_2_host.get_count(); el++) { + // Due to MKL inside generates floats on GPU and doubles on CPU, it makes sense to add minor eps. 
+ REQUIRE(abs(val_arr_1_host_ptr[el] - val_arr_2_host_ptr[el]) < 0.1); + } + } +}; + +using rng_types = COMBINE_TYPES((float, double), (mt2203, mt19937, mcg59, mrg32k3a, philox4x32x10)); + +TEMPLATE_LIST_TEST_M(rng_test, "rng cpu vs gpu", "[rng]", rng_types) { + SKIP_IF(this->get_policy().is_cpu()); + using Float = std::tuple_element_t<0, TestType>; + + std::int64_t elem_count = GENERATE_COPY(10, 777, 10000, 50000); + std::int64_t seed = GENERATE_COPY(777, 999); + + auto arr_gpu = this->allocate_array_device(elem_count); + auto arr_host = this->allocate_array_host(elem_count); + auto arr_gpu_ptr = arr_gpu.get_mutable_data(); + auto arr_host_ptr = arr_host.get_mutable_data(); + + auto rng_engine = this->get_dpc_engine(seed); + auto rng_engine_ = this->get_dpc_engine(seed); + + uniform(elem_count, arr_host_ptr, rng_engine, 0, elem_count); + uniform(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine_, 0, elem_count); + + this->check_results(arr_gpu, arr_host); +} + +using rng_types_skip_ahead_support = COMBINE_TYPES((float, double), + (mt19937, mcg59, mrg32k3a, philox4x32x10)); + +TEMPLATE_LIST_TEST_M(rng_test, "mixed rng cpu skip", "[rng]", rng_types_skip_ahead_support) { + SKIP_IF(this->get_policy().is_cpu()); + using Float = std::tuple_element_t<0, TestType>; + + std::int64_t elem_count = GENERATE_COPY(10, 777, 10000, 100000); + std::int64_t seed = GENERATE_COPY(777, 999); + + auto arr_host_init_1 = this->allocate_array_host(elem_count); + auto arr_host_init_2 = this->allocate_array_host(elem_count); + + auto arr_gpu = this->allocate_array_device(elem_count); + auto arr_host = this->allocate_array_host(elem_count); + + auto arr_host_init_1_ptr = arr_host_init_1.get_mutable_data(); + auto arr_host_init_2_ptr = arr_host_init_2.get_mutable_data(); + auto arr_gpu_ptr = arr_gpu.get_mutable_data(); + auto arr_host_ptr = arr_host.get_mutable_data(); + + auto rng_engine = this->get_dpc_engine(seed); + auto rng_engine_2 = this->get_dpc_engine(seed); + + uniform(elem_count, arr_host_init_1_ptr, rng_engine, 0, elem_count); + uniform(elem_count, arr_host_init_2_ptr, rng_engine_2, 0, elem_count); + + uniform(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine, 0, elem_count); + uniform(elem_count, arr_host_ptr, rng_engine_2, 0, elem_count); + + this->check_results(arr_host_init_1, arr_host_init_2); + this->check_results(arr_gpu, arr_host); +} + +TEMPLATE_LIST_TEST_M(rng_test, "mixed rng gpu skip", "[rng]", rng_types_skip_ahead_support) { + SKIP_IF(this->get_policy().is_cpu()); + using Float = std::tuple_element_t<0, TestType>; + + std::int64_t elem_count = GENERATE_COPY(10, 100, 777, 10000); + std::int64_t seed = GENERATE_COPY(1, 777, 999); + + auto arr_device_init_1 = this->allocate_array_device(elem_count); + auto arr_device_init_2 = this->allocate_array_device(elem_count); + + auto arr_gpu = this->allocate_array_device(elem_count); + auto arr_host = this->allocate_array_host(elem_count); + + auto arr_device_init_1_ptr = arr_device_init_1.get_mutable_data(); + auto arr_device_init_2_ptr = arr_device_init_2.get_mutable_data(); + auto arr_gpu_ptr = arr_gpu.get_mutable_data(); + auto arr_host_ptr = arr_host.get_mutable_data(); + + auto rng_engine = this->get_dpc_engine(seed); + auto rng_engine_2 = this->get_dpc_engine(seed); + + uniform(this->get_queue(), elem_count, arr_device_init_1_ptr, rng_engine, 0, elem_count); + uniform(this->get_queue(), + elem_count, + arr_device_init_2_ptr, + rng_engine_2, + 0, + elem_count); + + uniform(this->get_queue(), elem_count, arr_gpu_ptr, rng_engine, 0, 
elem_count); + uniform(elem_count, arr_host_ptr, rng_engine_2, 0, elem_count); + + this->check_results(arr_device_init_1, arr_device_init_2); + this->check_results(arr_gpu, arr_host); +} + +//TODO: add engine collection test + host_engine tests +// TEMPLATE_LIST_TEST_M(rng_test, "mixed rng gpu skip collection", "[rng]", rng_types_skip) { +// SKIP_IF(this->get_policy().is_cpu()); +// std::int64_t elem_count = GENERATE_COPY(10, 100, 777, 10000); +// std::int64_t seed = GENERATE_COPY(1, 777, 999); + +// engine_collection collection(this->get_queue(), 2, seed); + +// auto engine_arr = collection.get_dpc_engines(); + +// auto [arr_device_init_1, arr_device_init_2] = this->allocate_arrays_shared(elem_count); + +// auto arr_device_init_1_ptr = arr_device_init_1.get_mutable_data(); +// auto arr_device_init_2_ptr = arr_device_init_2.get_mutable_data(); + +// auto rn_gen = this->get_rng(); + +// rn_gen.uniform(this->get_queue(), +// elem_count, +// arr_device_init_1_ptr, +// engine_arr[0], +// 0, +// elem_count); + +// rn_gen.uniform(this->get_queue(), +// elem_count, +// arr_device_init_2_ptr, +// engine_arr[1], +// 0, +// elem_count); + +// // rn_gen.uniform(this->get_queue(), elem_count, arr_gpu_ptr, engine_arr[0], 0, elem_count); +// // rn_gen.uniform(elem_count, arr_host_ptr, engine_arr[1], 0, elem_count); + +// //this->check_results_device(arr_device_init_1, arr_device_init_2); +// this->check_results(arr_device_init_1, arr_device_init_2); +// } + +} // namespace oneapi::dal::backend::primitives::test diff --git a/cpp/oneapi/dal/detail/profiler.cpp b/cpp/oneapi/dal/detail/profiler.cpp index 4fbb21aec49..48215741415 100644 --- a/cpp/oneapi/dal/detail/profiler.cpp +++ b/cpp/oneapi/dal/detail/profiler.cpp @@ -15,27 +15,101 @@ *******************************************************************************/ #include "oneapi/dal/detail/profiler.hpp" +#include namespace oneapi::dal::detail { + +profiler::profiler() { + start_time = get_time(); +} + +profiler::~profiler() { + auto end_time = get_time(); + auto total_time = end_time - start_time; + std::cerr << "KERNEL_PROFILER: total time " << total_time / 1e6 << std::endl; +} + +std::uint64_t profiler::get_time() { + struct timespec t; + clock_gettime(CLOCK_MONOTONIC, &t); + return t.tv_sec * 1000000000 + t.tv_nsec; +} + +profiler* profiler::get_instance() { + static profiler instance; + return &instance; +} + +task& profiler::get_task() { + return task_; +} + +#ifdef ONEDAL_DATA_PARALLEL +sycl::queue& profiler::get_queue() { + return queue_; +} + +void profiler::set_queue(const sycl::queue& q) { + queue_ = q; +} +#endif + profiler_task profiler::start_task(const char* task_name) { + auto ns_start = get_time(); + auto& tasks_info = get_instance()->get_task(); + tasks_info.time_kernels[tasks_info.current_kernel] = ns_start; + tasks_info.current_kernel++; return profiler_task(task_name); } -void profiler::end_task(const char* task_name) {} +void profiler::end_task(const char* task_name) { + const std::uint64_t ns_end = get_time(); + auto& tasks_info = get_instance()->get_task(); +#ifdef ONEDAL_DATA_PARALLEL + auto& queue = get_instance()->get_queue(); + queue.wait_and_throw(); +#endif + tasks_info.current_kernel--; + const std::uint64_t times = ns_end - tasks_info.time_kernels[tasks_info.current_kernel]; -profiler_task::profiler_task(const char* task_name) : task_name_(task_name) {} + auto it = tasks_info.kernels.find(task_name); + if (it == tasks_info.kernels.end()) { + tasks_info.kernels.insert({ task_name, times }); + } + else { + it->second += 
times; + } + std::cerr << "KERNEL_PROFILER: " << std::string(task_name) << " " << times / 1e6 << std::endl; +} #ifdef ONEDAL_DATA_PARALLEL -profiler_task profiler::start_task(const char* task_name, const sycl::queue& task_queue) { +profiler_task profiler::start_task(const char* task_name, sycl::queue& task_queue) { + task_queue.wait_and_throw(); + get_instance()->set_queue(task_queue); + auto ns_start = get_time(); + auto& tasks_info = get_instance()->get_task(); + tasks_info.time_kernels[tasks_info.current_kernel] = ns_start; + tasks_info.current_kernel++; return profiler_task(task_name, task_queue); } + + profiler_task::profiler_task(const char* task_name, const sycl::queue& task_queue) : task_name_(task_name), - task_queue_(task_queue) {} + task_queue_(task_queue), + has_queue_(true) {} + #endif +profiler_task::profiler_task(const char* task_name) + : task_name_(task_name) {} + profiler_task::~profiler_task() { + #ifdef ONEDAL_DATA_PARALLEL + if (has_queue_) + task_queue_.wait_and_throw(); + #endif // ONEDAL_DATA_PARALLEL profiler::end_task(task_name_); } diff --git a/cpp/oneapi/dal/detail/profiler.hpp b/cpp/oneapi/dal/detail/profiler.hpp index 749d8f618a0..3eacba0ee63 100644 --- a/cpp/oneapi/dal/detail/profiler.hpp +++ b/cpp/oneapi/dal/detail/profiler.hpp @@ -19,6 +19,14 @@ #ifdef ONEDAL_DATA_PARALLEL #include #endif + + +#include +#include +#include +#include +#include +#include #define ONEDAL_PROFILER_CONCAT2(x, y) x##y #define ONEDAL_PROFILER_CONCAT(x, y) ONEDAL_PROFILER_CONCAT2(x, y) @@ -39,6 +47,16 @@ namespace oneapi::dal::detail { + + +struct task { + static const std::uint64_t MAX_KERNELS = 256; + std::map kernels; + std::uint64_t current_kernel = 0; + std::uint64_t time_kernels[MAX_KERNELS]; + void clear(); +}; + class profiler_task { public: profiler_task(const char* task_name); @@ -47,24 +65,38 @@ class profiler_task { #endif ~profiler_task(); - profiler_task(profiler_task& other) = delete; - - profiler_task& operator=(profiler_task& other) = delete; - private: const char* task_name_; #ifdef ONEDAL_DATA_PARALLEL sycl::queue task_queue_; + bool has_queue_; #endif }; class profiler { public: + profiler(); + ~profiler(); static profiler_task start_task(const char* task_name); + static std::uint64_t get_time(); + static profiler* get_instance(); + task& get_task(); + #ifdef ONEDAL_DATA_PARALLEL - static profiler_task start_task(const char* task_name, const sycl::queue& task_queue); + sycl::queue& get_queue(); + void set_queue(const sycl::queue& q); + + + static profiler_task start_task(const char* task_name, sycl::queue& task_queue); #endif static void end_task(const char* task_name); + +private: + std::uint64_t start_time; + task task_; +#ifdef ONEDAL_DATA_PARALLEL + sycl::queue queue_; +#endif }; } // namespace oneapi::dal::detail diff --git a/docs/source/daal/algorithms/engines/index.rst b/docs/source/daal/algorithms/engines/index.rst index 9def0af4c81..133513fd9f2 100644 --- a/docs/source/daal/algorithms/engines/index.rst +++ b/docs/source/daal/algorithms/engines/index.rst @@ -111,4 +111,6 @@ These methods are represented with member functions of classes that represent fu mt19937.rst mcg59.rst + mrg32k3a.rst + philox4x32x10.rst mt2203.rst diff --git a/docs/source/daal/algorithms/engines/mrg32k3a.rst b/docs/source/daal/algorithms/engines/mrg32k3a.rst new file mode 100644 index 00000000000..e931c801890 --- /dev/null +++ b/docs/source/daal/algorithms/engines/mrg32k3a.rst @@ -0,0 +1,62 @@ +.. Copyright contributors to the oneDAL project +.. +.. 
Licensed under the Apache License, Version 2.0 (the "License");
+.. you may not use this file except in compliance with the License.
+.. You may obtain a copy of the License at
+..
+.. http://www.apache.org/licenses/LICENSE-2.0
+..
+.. Unless required by applicable law or agreed to in writing, software
+.. distributed under the License is distributed on an "AS IS" BASIS,
+.. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. See the License for the specific language governing permissions and
+.. limitations under the License.
+
+mrg32k3a
+========
+
+The engine is based on a 32-bit combined multiple recursive generator
+with two components of order 3, optimized for batch processing.
+
+.. rubric:: Subsequence selection methods support
+
+skipAhead (nskip)
+   Supported
+leapfrog (threadIdx, nThreads)
+   Supported
+
+Batch Processing
+****************
+
+The mrg32k3a engine needs the initial condition (``seed``) for state initialization.
+The seed can be either an integer scalar or a vector of :math:`p` integer elements, the inputs to the respective engine constructors.
+
+.. rubric:: Algorithm Parameters
+
+The mrg32k3a engine has the following parameters:
+
+.. tabularcolumns:: |\Y{0.2}|\Y{0.2}|\Y{0.6}|
+
+.. list-table:: Algorithm Parameters for mrg32k3a engine (Batch Processing)
+   :header-rows: 1
+   :widths: 10 20 30
+   :align: left
+   :class: longtable
+
+   * - Parameter
+     - Default Value
+     - Description
+   * - ``algorithmFPType``
+     - ``float``
+     - The floating-point type that the algorithm uses for intermediate computations. Can be ``float`` or ``double``.
+   * - ``method``
+     - ``defaultDense``
+     - Performance-oriented computation method; the only method supported by the algorithm.
+   * - ``seed``
+     -
+       - :math:`777` for a scalar seed
+       - NA for a vector seed
+     - Initial condition for state initialization, scalar or vector:
+
+       - Scalar, value of ``size_t`` type
+       - Vector, pointer to ``HomogenNumericTable`` of size :math:`1 \times p`
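A minimal usage sketch of the new engine through the DAAL C++ API, for readers who want to see where the ``seed`` parameter described above enters the picture. The uniform distribution class, its ``tableToFill`` input id, and the ``HomogenNumericTable`` factory follow the existing DAAL distribution examples and are shown only for illustration; they are assumptions, not part of this change.

    #include "daal.h"

    using namespace daal::algorithms;
    using namespace daal::data_management;

    int main() {
        // Create the mrg32k3a engine with a scalar seed (777 is the documented default).
        engines::EnginePtr engine = engines::mrg32k3a::Batch<>::create(777);

        // Assumed usage, mirroring the existing uniform-distribution example:
        // fill a 1000 x 1 table with values uniformly distributed on [0, 1).
        NumericTablePtr table =
            HomogenNumericTable<float>::create(1, 1000, NumericTable::doAllocate);

        distributions::uniform::Batch<float> uniform(0.0f, 1.0f);
        uniform.parameter.engine = engine;
        uniform.input.set(distributions::tableToFill, table);
        uniform.compute();

        return 0;
    }

The same pattern applies to the philox4x32x10 engine documented next; only the namespace in the ``create`` call changes.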
diff --git a/docs/source/daal/algorithms/engines/philox4x32x10.rst b/docs/source/daal/algorithms/engines/philox4x32x10.rst
new file mode 100644
index 00000000000..ac50ea80fdb
--- /dev/null
+++ b/docs/source/daal/algorithms/engines/philox4x32x10.rst
@@ -0,0 +1,62 @@
+.. Copyright contributors to the oneDAL project
+..
+.. Licensed under the Apache License, Version 2.0 (the "License");
+.. you may not use this file except in compliance with the License.
+.. You may obtain a copy of the License at
+..
+.. http://www.apache.org/licenses/LICENSE-2.0
+..
+.. Unless required by applicable law or agreed to in writing, software
+.. distributed under the License is distributed on an "AS IS" BASIS,
+.. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+.. See the License for the specific language governing permissions and
+.. limitations under the License.
+
+philox4x32x10
+=============
+
+Implementation of the Philox4x32-10 engine: a counter-based pseudorandom number generator (PRNG)
+that uses a 4x32-bit counter and performs 10 rounds of mixing to produce high-quality randomness.
+
+.. rubric:: Subsequence selection methods support
+
+skipAhead (nskip)
+   Supported
+leapfrog (threadIdx, nThreads)
+   Supported
+
+Batch Processing
+****************
+
+The philox4x32x10 engine needs the initial condition (``seed``) for state initialization.
+The seed can be either an integer scalar or a vector of :math:`p` integer elements, the inputs to the respective engine constructors.
+
+.. rubric:: Algorithm Parameters
+
+The philox4x32x10 engine has the following parameters:
+
+.. tabularcolumns:: |\Y{0.2}|\Y{0.2}|\Y{0.6}|
+
+.. list-table:: Algorithm Parameters for philox4x32x10 engine (Batch Processing)
+   :header-rows: 1
+   :widths: 10 20 30
+   :align: left
+   :class: longtable
+
+   * - Parameter
+     - Default Value
+     - Description
+   * - ``algorithmFPType``
+     - ``float``
+     - The floating-point type that the algorithm uses for intermediate computations. Can be ``float`` or ``double``.
+   * - ``method``
+     - ``defaultDense``
+     - Performance-oriented computation method; the only method supported by the algorithm.
+   * - ``seed``
+     -
+       - :math:`777` for a scalar seed
+       - NA for a vector seed
+     - Initial condition for state initialization, scalar or vector:
+
+       - Scalar, value of ``size_t`` type
+       - Vector, pointer to ``HomogenNumericTable`` of size :math:`1 \times p`
diff --git a/makefile.lst b/makefile.lst
index 92dc52ff521..b042ede80a7 100755
--- a/makefile.lst
+++ b/makefile.lst
@@ -65,7 +65,7 @@ multiclassclassifier += classifier k_nearest_neighbors += engines classifier logistic_regression += classifier optimization_solver objective_function engines implicit_als += engines distributions
-engines += engines/mt19937 engines/mcg59 engines/mt2203
+engines += engines/mt19937 engines/mcg59 engines/mrg32k3a engines/philox4x32x10 engines/mt2203
 distributions += distributions/bernoulli distributions/normal distributions/uniform tsne +=
@@ -95,6 +95,8 @@ CORE.ALGORITHMS.FULL := \ elastic_net \ engines \ engines/mcg59 \
+ engines/mrg32k3a \
+ engines/philox4x32x10 \
 engines/mt19937 \ engines/mt2203 \ em \
@@ -309,6 +311,8 @@ JJ.ALGORITHMS := adaboost elastic_net/prediction \ engines \ engines/mcg59 \
+ engines/mrg32k3a \
+ engines/philox4x32x10 \
 engines/mt19937 \ engines/mt2203 \ em_gmm \
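To tie the pieces of this change together, here is a rough usage sketch of the new ``host_engine``/``dpc_engine`` wrappers and the free ``uniform`` overloads added in rng.hpp and rng_dpc.cpp above. The template parameters are inferred from the surrounding code, so treat the exact signatures as assumptions rather than the final API.

    #include "oneapi/dal/backend/primitives/rng/rng.hpp"

    namespace pr = oneapi::dal::backend::primitives;

    // Host-only path: wraps daal::algorithms::engines::mrg32k3a::Batch<> internally
    // and fills a host buffer with uniformly distributed values.
    void fill_host(float* dst, std::int64_t count, std::int64_t seed) {
        pr::host_engine<pr::engine_method::mrg32k3a> eng(seed);
        pr::uniform(count, dst, eng, 0.0f, 1.0f);
    }

    #ifdef ONEDAL_DATA_PARALLEL
    // Device path: dpc_engine keeps the DAAL host engine and the oneMKL device engine
    // in sync (the host engine is skipped ahead after each device-side generation),
    // so host- and device-side fills can be interleaved on a single engine.
    void fill_device(sycl::queue& q, float* dst_device, std::int64_t count, std::int64_t seed) {
        pr::dpc_engine<pr::engine_method::mrg32k3a> eng(q, seed);
        pr::uniform(q, count, dst_device, eng, 0.0f, 1.0f);
    }
    #endif

Note the USM requirements that follow from the checks in the code above: the host overloads reject device-allocated buffers, while the queue overload rejects host-allocated ones, so the destination pointer must match the overload being called.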