Skip to content

Commit 197aad9

Browse files
committed
Parallelize hierarchical DRC processing with OpenMP
- Replace sequential task queues in dbHierProcessor and dbCompoundOperation with OpenMP tasking and parallel loops
- Add thread safety to CompoundRegionOperationCache using tl::Mutex
- Enable OpenMP compiler and linker flags in klayout.pri
- Update _drc_engine.rb documentation to explicitly state thread allocation for hierarchical mode
1 parent 6270877 commit 197aad9

6 files changed

Lines changed: 160 additions & 52 deletions

File tree

build.sh

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ HAVE_EXPAT=0
4242
HAVE_GIT2=1
4343
HAVE_LSTREAM=1
4444
HAVE_CPP20=0
45+
HAVE_OPENMP=0
4546

4647
RUBYINCLUDE=""
4748
RUBYINCLUDE2=""
@@ -103,6 +104,9 @@ while [ "$*" != "" ]; do
103104
-without-qtbinding)
104105
HAVE_QTBINDINGS=0
105106
;;
107+
-with-openmp)
108+
HAVE_OPENMP=1
109+
;;
106110
-without-qt-uitools)
107111
HAVE_QT_UITOOLS=0
108112
;;
@@ -255,6 +259,7 @@ while [ "$*" != "" ]; do
255259
echo " -with-qtbinding Create Qt bindings for ruby scripts [default]"
256260
echo " -without-qtbinding Don't create Qt bindings for ruby scripts"
257261
echo " -without-qt-uitools Don't include uitools in Qt binding"
262+
echo " -with-openmp Enable OpenMP parallelization for hierarchical processing"
258263
echo " -with-64bit-coord Use long (64bit) coordinates - EXPERIMENTAL FEATURE"
259264
echo " (only available for gcc>=4.4 for 64bit build)"
260265
echo " -without-64bit-coord Don't use long (64bit) coordinates [default]"
@@ -601,6 +606,7 @@ echo " HAVE_PNG=$HAVE_PNG"
601606
echo " HAVE_EXPAT=$HAVE_EXPAT"
602607
echo " HAVE_GIT2=$HAVE_GIT2"
603608
echo " HAVE_LSTREAM=$HAVE_LSTREAM"
609+
echo " HAVE_OPENMP=$HAVE_OPENMP"
604610
echo " RPATH=$RPATH"
605611

606612
mkdir -p $BUILD
@@ -676,6 +682,7 @@ qmake_options=(
676682
HAVE_GIT2="$HAVE_GIT2"
677683
HAVE_LSTREAM="$HAVE_LSTREAM"
678684
HAVE_CPP20="$HAVE_CPP20"
685+
HAVE_OPENMP="$HAVE_OPENMP"
679686
PREFIX="$BIN"
680687
RPATH="$RPATH"
681688
KLAYOUT_VERSION="$KLAYOUT_VERSION"

src/db/db/dbCompoundOperation.cc

Lines changed: 69 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -766,46 +766,52 @@ CompoundRegionGeometricalBoolOperationNode::implement_bool (CompoundRegionOperat
766766
one_a.push_back (std::unordered_set<T1> ());
767767

768768
shape_interactions<T, T> computed_a;
769-
child (0)->compute_local (cache, layout, cell, interactions_for_child (interactions, 0, computed_a), one_a, proc);
770769

771-
if (one_a.front ().empty ()) {
772-
773-
if (m_op == GeometricalOp::And || m_op == GeometricalOp::Not) {
774-
775-
// .. no results ..
770+
std::vector<std::unordered_set<T2> > one_b;
771+
one_b.push_back (std::unordered_set<T2> ());
776772

777-
} else {
773+
shape_interactions<T, T> computed_b;
778774

779-
std::vector<std::unordered_set<T2> > one_b;
780-
one_b.push_back (std::unordered_set<T2> ());
775+
bool can_parallel = (m_op != GeometricalOp::And && m_op != GeometricalOp::Not);
781776

782-
shape_interactions<T, T> computed_b;
777+
#if defined(_OPENMP)
778+
if (can_parallel && proc->threads() > 0) {
779+
#pragma omp task shared(one_a, computed_a, cache, layout, cell, interactions, proc)
780+
{
781+
child (0)->compute_local (cache, layout, cell, interactions_for_child (interactions, 0, computed_a), one_a, proc);
782+
}
783+
#pragma omp task shared(one_b, computed_b, cache, layout, cell, interactions, proc)
784+
{
785+
child (1)->compute_local (cache, layout, cell, interactions_for_child (interactions, 1, computed_b), one_b, proc);
786+
}
787+
#pragma omp taskwait
788+
} else
789+
#endif
790+
{
791+
child (0)->compute_local (cache, layout, cell, interactions_for_child (interactions, 0, computed_a), one_a, proc);
792+
if (!one_a.front().empty()) {
783793
child (1)->compute_local (cache, layout, cell, interactions_for_child (interactions, 1, computed_b), one_b, proc);
794+
} else {
795+
if (!can_parallel) { // And or Not and A is empty
796+
return; // nothing to do, results remain empty
797+
}
798+
}
799+
}
784800

801+
if (one_a.front ().empty ()) {
802+
if (!can_parallel) {
803+
// .. no results ..
804+
} else {
785805
copy_results (results, one_b);
786-
787806
}
788-
789807
} else {
790-
791-
std::vector<std::unordered_set<T2> > one_b;
792-
one_b.push_back (std::unordered_set<T2> ());
793-
794-
shape_interactions<T, T> computed_b;
795-
child (1)->compute_local (cache, layout, cell, interactions_for_child (interactions, 1, computed_b), one_b, proc);
796-
797808
if (one_b.front ().empty ()) {
798-
799809
if (m_op != GeometricalOp::And) {
800810
copy_results (results, one_a);
801811
}
802-
803812
} else {
804-
805813
run_bool (m_op, layout, one_a.front (), one_b.front (), results.front ());
806-
807814
}
808-
809815
}
810816
}
811817

@@ -934,30 +940,54 @@ void compound_region_generic_operation_node<TS, TI, TR>::implement_compute_local
934940
shape_interactions<TTS, TTI> self_interactions_heap;
935941
const shape_interactions<TTS, TTI> &self_interactions = interactions_for_child (interactions, 0, self_interactions_heap);
936942

937-
self->compute_local (cache, layout, cell, self_interactions, self_result, proc);
938-
939-
db::generic_shape_iterator <TS> is (self_result.front ().begin (), self_result.front ().end ());
940-
941943
std::vector<db::generic_shape_iterator<TI> > iiv;
942944
std::vector<std::unordered_set<TI> > intruder_results;
943-
intruder_results.reserve (children () - 1); // important, so that the memory layout will not change while we generate them
945+
intruder_results.resize (children () - 1); // allocate memory upfront
944946

945-
for (unsigned int ci = 1; ci < children (); ++ci) {
947+
#if defined(_OPENMP)
948+
if (proc->threads() > 0) {
949+
#pragma omp task shared(self_result, self_interactions_heap, cache, layout, cell, interactions, proc)
950+
{
951+
self->compute_local (cache, layout, cell, self_interactions, self_result, proc);
952+
}
953+
for (unsigned int ci = 1; ci < children (); ++ci) {
954+
#pragma omp task shared(intruder_results, cache, layout, cell, interactions, proc) firstprivate(ci)
955+
{
956+
const CompoundRegionOperationNode *intruder = child (ci);
957+
std::vector<std::unordered_set<TI> > intruder_result;
958+
intruder_result.push_back (std::unordered_set<TI> ());
959+
960+
shape_interactions<TTS, TTI> intruder_interactions_heap;
961+
const shape_interactions<TTS, TTI> &intruder_interactions = interactions_for_child (interactions, ci, intruder_interactions_heap);
962+
963+
intruder->compute_local (cache, layout, cell, intruder_interactions, intruder_result, proc);
964+
intruder_results[ci - 1] = std::move(intruder_result.front());
965+
}
966+
}
967+
#pragma omp taskwait
968+
} else
969+
#endif
970+
{
971+
self->compute_local (cache, layout, cell, self_interactions, self_result, proc);
946972

947-
const CompoundRegionOperationNode *intruder = child (ci);
948-
std::vector<std::unordered_set<TI> > intruder_result;
949-
intruder_result.push_back (std::unordered_set<TI> ());
973+
for (unsigned int ci = 1; ci < children (); ++ci) {
950974

951-
shape_interactions<TTS, TTI> intruder_interactions_heap;
952-
const shape_interactions<TTS, TTI> &intruder_interactions = interactions_for_child (interactions, ci, intruder_interactions_heap);
975+
const CompoundRegionOperationNode *intruder = child (ci);
976+
std::vector<std::unordered_set<TI> > intruder_result;
977+
intruder_result.push_back (std::unordered_set<TI> ());
953978

954-
intruder->compute_local (cache, layout, cell, intruder_interactions, intruder_result, proc);
979+
shape_interactions<TTS, TTI> intruder_interactions_heap;
980+
const shape_interactions<TTS, TTI> &intruder_interactions = interactions_for_child (interactions, ci, intruder_interactions_heap);
955981

956-
intruder_results.push_back (std::unordered_set<TI> ());
957-
intruder_results.back ().swap (intruder_result.front ());
982+
intruder->compute_local (cache, layout, cell, intruder_interactions, intruder_result, proc);
983+
intruder_results[ci - 1] = std::move(intruder_result.front());
984+
}
985+
}
958986

959-
iiv.push_back (db::generic_shape_iterator<TI> (intruder_results.back ().begin (), intruder_results.back ().end ()));
987+
db::generic_shape_iterator <TS> is (self_result.front ().begin (), self_result.front ().end ());
960988

989+
for (unsigned int ci = 1; ci < children (); ++ci) {
990+
iiv.push_back (db::generic_shape_iterator <TI> (intruder_results[ci - 1].begin (), intruder_results[ci - 1].end ()));
961991
}
962992

963993
db::local_processor <TS, TI, TR> lproc (layout);

src/db/db/dbCompoundOperation.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,19 +57,23 @@ class CompoundRegionOperationNode;
5757
* This cache is important to avoid duplicate evaluation of the same node in
5858
* a diamond-graph structure of nodes.
5959
*/
60+
#include "tlThreads.h"
61+
6062
class DB_PUBLIC CompoundRegionOperationCache
6163
{
6264
public:
6365
template <class TR>
6466
std::pair<bool, std::vector<std::unordered_set<TR> > *> get (const CompoundRegionOperationNode *node)
6567
{
68+
tl::MutexLocker lock (&m_mutex);
6669
bool valid = false;
6770
std::vector<std::unordered_set<TR> > *cache = 0;
6871
get_cache (cache, valid, node);
6972
return std::make_pair (valid, cache);
7073
}
7174

7275
private:
76+
tl::Mutex m_mutex;
7377
std::map<const CompoundRegionOperationNode *, std::vector<std::unordered_set<db::PolygonRefWithProperties> > > m_cache_polyref_wp;
7478
std::map<const CompoundRegionOperationNode *, std::vector<std::unordered_set<db::PolygonWithProperties> > > m_cache_poly_wp;
7579
std::map<const CompoundRegionOperationNode *, std::vector<std::unordered_set<db::EdgeWithProperties> > > m_cache_edge_wp;

src/db/db/dbHierProcessor.cc

Lines changed: 63 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -890,18 +890,41 @@ void local_processor<TS, TI, TR>::compute_contexts (local_processor_contexts<TS,
890890

891891
tl::SelfTimer timer (tl::verbosity () > base_verbosity () + 10, tl::to_string (tr ("Computing contexts for ")) + description (op));
892892

893+
#if defined(_OPENMP)
894+
if (threads () > 0) {
895+
mp_cc_job.reset (0);
896+
} else {
897+
mp_cc_job.reset (0);
898+
}
899+
#else
893900
if (threads () > 0) {
894901
mp_cc_job.reset (new tl::Job<local_processor_context_computation_worker<TS, TI, TR> > (threads ()));
895902
} else {
896903
mp_cc_job.reset (0);
897904
}
905+
#endif
898906

899907
contexts.clear ();
900908
contexts.set_intruder_layers (intruder_layers);
901909
contexts.set_subject_layer (subject_layer);
902910

903911
typename local_processor_cell_contexts<TS, TI, TR>::context_key_type intruders;
912+
#if defined(_OPENMP)
913+
if (threads() > 0) {
914+
int nthreads = threads();
915+
#pragma omp parallel num_threads(nthreads) shared(contexts, intruders)
916+
{
917+
#pragma omp single
918+
{
919+
issue_compute_contexts (contexts, 0, 0, mp_subject_top, db::ICplxTrans (), mp_intruder_top, intruders, op->dist ());
920+
}
921+
}
922+
} else {
923+
issue_compute_contexts (contexts, 0, 0, mp_subject_top, db::ICplxTrans (), mp_intruder_top, intruders, op->dist ());
924+
}
925+
#else
904926
issue_compute_contexts (contexts, 0, 0, mp_subject_top, db::ICplxTrans (), mp_intruder_top, intruders, op->dist ());
927+
#endif
905928

906929
if (mp_cc_job.get ()) {
907930
mp_cc_job->start ();
@@ -926,11 +949,24 @@ void local_processor<TS, TI, TR>::issue_compute_contexts (local_processor_contex
926949
{
927950
bool is_small_job = subject_cell->begin ().at_end ();
928951

952+
#if defined(_OPENMP)
953+
if (! is_small_job && threads() > 0) {
954+
typename local_processor_cell_contexts<TS, TI, TR>::context_key_type my_intruders;
955+
my_intruders.swap (intruders);
956+
#pragma omp task shared(contexts) firstprivate(parent_context, subject_parent, subject_cell, subject_cell_inst, intruder_cell, my_intruders, dist)
957+
{
958+
compute_contexts (contexts, parent_context, subject_parent, subject_cell, subject_cell_inst, intruder_cell, my_intruders, dist);
959+
}
960+
} else {
961+
compute_contexts (contexts, parent_context, subject_parent, subject_cell, subject_cell_inst, intruder_cell, intruders, dist);
962+
}
963+
#else
929964
if (! is_small_job && mp_cc_job.get ()) {
930965
mp_cc_job->schedule (new local_processor_context_computation_task<TS, TI, TR> (this, contexts, parent_context, subject_parent, subject_cell, subject_cell_inst, intruder_cell, intruders, dist));
931966
} else {
932967
compute_contexts (contexts, parent_context, subject_parent, subject_cell, subject_cell_inst, intruder_cell, intruders, dist);
933968
}
969+
#endif
934970
}
935971

936972
template <class TS, class TI, class TR>
@@ -1164,8 +1200,6 @@ local_processor<TS, TI, TR>::compute_results (local_processor_contexts<TS, TI, T
11641200

11651201
if (threads () > 0) {
11661202

1167-
std::unique_ptr<tl::Job<local_processor_result_computation_worker<TS, TI, TR> > > rc_job (new tl::Job<local_processor_result_computation_worker<TS, TI, TR> > (threads ()));
1168-
11691203
// schedule computation jobs in "waves": we need to make sure they are executed
11701204
// bottom-up. So we identify a new bunch of cells each time we pass through the cell set
11711205
// and proceed until all cells are removed.
@@ -1188,6 +1222,8 @@ local_processor<TS, TI, TR>::compute_results (local_processor_contexts<TS, TI, T
11881222
std::vector<db::cell_index_type> next_cells_bu;
11891223
next_cells_bu.reserve (cells_bu.size ());
11901224

1225+
std::vector<local_processor_result_computation_task<TS, TI, TR>*> tasks;
1226+
11911227
for (std::vector<db::cell_index_type>::const_iterator bu = cells_bu.begin (); bu != cells_bu.end (); ++bu) {
11921228

11931229
tl::MutexLocker locker (& contexts.lock ());
@@ -1197,7 +1233,7 @@ local_processor<TS, TI, TR>::compute_results (local_processor_contexts<TS, TI, T
11971233

11981234
if (later.find (*bu) == later.end ()) {
11991235

1200-
rc_job->schedule (new local_processor_result_computation_task<TS, TI, TR> (this, contexts, cpc->first, &cpc->second, op, output_layers));
1236+
tasks.push_back(new local_processor_result_computation_task<TS, TI, TR> (this, contexts, cpc->first, &cpc->second, op, output_layers));
12011237
any = true;
12021238

12031239
} else {
@@ -1218,20 +1254,37 @@ local_processor<TS, TI, TR>::compute_results (local_processor_contexts<TS, TI, T
12181254
break;
12191255
}
12201256

1221-
if (rc_job.get ()) {
1222-
1257+
if (!tasks.empty()) {
12231258
try {
1224-
1225-
rc_job->start ();
1259+
#if defined(_OPENMP)
1260+
int nthreads = threads();
1261+
#pragma omp parallel for num_threads(nthreads) schedule(dynamic)
1262+
for (long long i = 0; i < (long long)tasks.size(); ++i) {
1263+
tasks[i]->perform();
1264+
}
1265+
#else
1266+
std::unique_ptr<tl::Job<local_processor_result_computation_worker<TS, TI, TR> > > rc_job (new tl::Job<local_processor_result_computation_worker<TS, TI, TR> > (threads ()));
1267+
for (size_t i = 0; i < tasks.size(); ++i) {
1268+
rc_job->schedule(tasks[i]);
1269+
}
1270+
rc_job->start();
12261271
while (! rc_job->wait (10)) {
12271272
progress.set (get_progress ());
12281273
}
1229-
1274+
#endif
12301275
} catch (...) {
1231-
rc_job->terminate ();
1276+
#if !defined(_OPENMP)
1277+
// rc_job is a std::unique_ptr scoped to the try block, so it is released automatically; the OpenMP build does not create an rc_job at all
1278+
#endif
1279+
for (size_t i = 0; i < tasks.size(); ++i) { delete tasks[i]; }
12321280
throw;
12331281
}
1234-
1282+
#if defined(_OPENMP)
1283+
for (size_t i = 0; i < tasks.size(); ++i) {
1284+
delete tasks[i];
1285+
}
1286+
progress.set(get_progress());
1287+
#endif
12351288
}
12361289

12371290
}

src/drc/drc/built-in-macros/_drc_engine.rb

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1238,11 +1238,11 @@ def flat
12381238

12391239
# %DRC%
12401240
# @name threads
1241-
# @brief Specifies the number of CPU cores to use in tiling mode
1241+
# @brief Specifies the number of CPU cores to use in tiling and hierarchical mode
12421242
# @synopsis threads(n)
12431243
# @synopsis threads
1244-
# If using threads, tiles are distributed on multiple CPU cores for
1245-
# parallelization. Still, all tiles must be processed before the
1244+
# If using threads, tiles or hierarchical cells are distributed on multiple CPU cores for
1245+
# parallelization. Still, all tiles or cells must be processed before the
12461246
# operation proceeds with the next statement.
12471247
#
12481248
# Without an argument, "threads" will return the current number of

src/klayout.pri

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,20 @@ msvc {
216216
}
217217
}
218218

219+
equals(HAVE_OPENMP, "1") {
220+
msvc {
221+
QMAKE_CXXFLAGS += /openmp
222+
QMAKE_LFLAGS += /openmp
223+
} else:macx {
224+
QMAKE_CXXFLAGS += -Xpreprocessor -fopenmp
225+
LIBS += -lomp
226+
} else {
227+
QMAKE_CXXFLAGS += -fopenmp
228+
QMAKE_LFLAGS += -fopenmp
229+
}
230+
DEFINES += _OPENMP
231+
}
232+
219233
win32 {
220234

221235
QMAKE_LFLAGS += -Wl,--exclude-all-symbols

0 commit comments

Comments
 (0)