@@ -87,37 +87,37 @@ void RXMesh::init(const std::vector<std::vector<uint32_t>>& fv,
8787 m_timers.add (" buildHT" );
8888 m_timers.add (" cudaMalloc" );
8989 m_timers.add (" malloc" );
90+ m_timers.add (" hashtable.move" );
91+ m_timers.add (" cudaMemcpy" );
92+ m_timers.add (" bitmask.cudaMemcpy" );
9093
94+ // 1)
9195 m_timers.add (" build" );
9296 m_timers.start (" build" );
9397 build (fv, patcher_file);
9498 m_timers.stop (" build" );
95- RXMESH_INFO (" build time = {} (ms)" , m_timers.elapsed_millis (" build" ));
9699
100+ // 2)
97101 m_timers.add (" populate_patch_stash" );
98102 m_timers.start (" populate_patch_stash" );
99103 populate_patch_stash ();
100104 m_timers.stop (" populate_patch_stash" );
101- RXMESH_INFO (" populate_patch_stash time = {} (ms)" ,
102- m_timers.elapsed_millis (" populate_patch_stash" ));
103105
106+ // 3)
104107 m_timers.add (" coloring" );
105108 m_timers.start (" coloring" );
106109 patch_graph_coloring ();
107110 m_timers.stop (" coloring" );
108111 RXMESH_INFO (" Num colors = {}" , m_num_colors);
109- RXMESH_INFO (" patch graph coloring time = {} (ms)" ,
110- m_timers.elapsed_millis (" coloring" ));
111-
112112
113+ // 4)
113114 m_timers.add (" build_device" );
114115 m_timers.start (" build_device" );
115116 build_device ();
116117 m_timers.stop (" build_device" );
117- RXMESH_INFO (" build_device time = {} (ms)" ,
118- m_timers.elapsed_millis (" build_device" ));
119118
120119
120+ // 5)
121121 m_timers.add (" PatchScheduler" );
122122 m_timers.start (" PatchScheduler" );
123123 PatchScheduler sch;
@@ -126,18 +126,16 @@ void RXMesh::init(const std::vector<std::vector<uint32_t>>& fv,
126126 BYTES_TO_MEGABYTES (sizeof (uint32_t ) * get_max_num_patches ());
127127 sch.refill (get_num_patches ());
128128 m_timers.stop (" PatchScheduler" );
129- RXMESH_INFO (" PatchScheduler time = {} (ms)" ,
130- m_timers.elapsed_millis (" PatchScheduler" ));
131129
132130
131+ // 6)
133132 m_timers.add (" allocate_extra_patches" );
134133 m_timers.start (" allocate_extra_patches" );
135134 // Allocate extra patches
136135 allocate_extra_patches ();
137136 m_timers.stop (" allocate_extra_patches" );
138- RXMESH_INFO (" allocate_extra_patches time = {} (ms)" ,
139- m_timers.elapsed_millis (" allocate_extra_patches" ));
140137
138+ // 7)
141139 m_timers.add (" context.init" );
142140 m_timers.start (" context.init" );
143141 // Allocate and copy the context to the gpu
@@ -159,8 +157,6 @@ void RXMesh::init(const std::vector<std::vector<uint32_t>>& fv,
159157 m_d_patches_info,
160158 sch);
161159 m_timers.stop (" context.init" );
162- RXMESH_INFO (" context.init time = {} (ms)" ,
163- m_timers.elapsed_millis (" context.init" ));
164160
165161
166162 RXMESH_INFO (" #Vertices = {}, #Faces= {}, #Edges= {}, #Patches = {}" ,
@@ -181,19 +177,39 @@ void RXMesh::init(const std::vector<std::vector<uint32_t>>& fv,
181177 RXMESH_INFO (" per-patch maximum vertex count = {}" ,
182178 m_max_vertices_per_patch);
183179
184- RXMESH_INFO (" cudaMalloc time = {} (ms)" ,
185- m_timers.elapsed_millis (" cudaMalloc" ));
186-
187- RXMESH_INFO (" malloc time = {} (ms)" , m_timers.elapsed_millis (" malloc" ));
188-
189- RXMESH_INFO (" buildHT time = {} (ms)" , m_timers.elapsed_millis (" buildHT" ));
190- RXMESH_INFO (" bitmask time = {} (ms)" , m_timers.elapsed_millis (" bitmask" ));
191- RXMESH_INFO (" lower_bound time = {} (ms)" ,
180+ // //
181+ RXMESH_INFO (" 1) build time = {} (ms)" , m_timers.elapsed_millis (" build" ));
182+ RXMESH_INFO (" 2) populate_patch_stash time = {} (ms)" ,
183+ m_timers.elapsed_millis (" populate_patch_stash" ));
184+ RXMESH_INFO (" 3) patch graph coloring time = {} (ms)" ,
185+ m_timers.elapsed_millis (" coloring" ));
186+ RXMESH_INFO (" 4) build_device time = {} (ms)" ,
187+ m_timers.elapsed_millis (" build_device" ));
188+ RXMESH_INFO (" -buildHT time = {} (ms)" , m_timers.elapsed_millis (" buildHT" ));
189+ RXMESH_INFO (" --lower_bound time = {} (ms)" ,
192190 m_timers.elapsed_millis (" lower_bound" ));
193- RXMESH_INFO (" ht.insert time = {} (ms)" ,
191+ RXMESH_INFO (" -- ht.insert time = {} (ms)" ,
194192 m_timers.elapsed_millis (" ht.insert" ));
195- RXMESH_INFO (" LPHashTable time = {} (ms)" ,
193+ RXMESH_INFO (" --hashtable.move time = {} (ms)" ,
194+ m_timers.elapsed_millis (" hashtable.move" ));
195+ RXMESH_INFO (" --LPHashTable time = {} (ms)" ,
196196 m_timers.elapsed_millis (" LPHashTable" ));
197+ RXMESH_INFO (" -bitmask time = {} (ms)" , m_timers.elapsed_millis (" bitmask" ));
198+ RXMESH_INFO (" --bitmask.cudaMemcpy time = {} (ms)" ,
199+ m_timers.elapsed_millis (" bitmask.cudaMemcpy" ));
200+
201+ RXMESH_INFO (" 5) PatchScheduler time = {} (ms)" ,
202+ m_timers.elapsed_millis (" PatchScheduler" ));
203+ RXMESH_INFO (" 6) allocate_extra_patches time = {} (ms)" ,
204+ m_timers.elapsed_millis (" allocate_extra_patches" ));
205+ RXMESH_INFO (" 7) context.init time = {} (ms)" ,
206+ m_timers.elapsed_millis (" context.init" ));
207+
208+ RXMESH_INFO (" cudaMemcpy time = {} (ms)" ,
209+ m_timers.elapsed_millis (" cudaMemcpy" ));
210+ RXMESH_INFO (" cudaMalloc time = {} (ms)" ,
211+ m_timers.elapsed_millis (" cudaMalloc" ));
212+ RXMESH_INFO (" malloc time = {} (ms)" , m_timers.elapsed_millis (" malloc" ));
197213}
198214
199215RXMesh::~RXMesh ()
@@ -980,7 +996,7 @@ void RXMesh::build_device()
980996 BYTES_TO_MEGABYTES (get_max_num_patches () * sizeof (PatchInfo));
981997
982998
983- #pragma omp parallel for
999+ // #pragma omp parallel for
9841000 for (int p = 0 ; p < static_cast <int >(get_num_patches ()); ++p) {
9851001
9861002 const uint16_t p_num_vertices =
@@ -1094,6 +1110,7 @@ void RXMesh::build_device_single_patch(const uint32_t patch_id,
10941110 BYTES_TO_MEGABYTES (PatchStash::stash_size * sizeof (uint32_t ));
10951111
10961112 // copy count and capacities
1113+ m_timers.start (" cudaMemcpy" );
10971114 CUDA_ERROR (cudaMemcpy (d_patch.num_faces ,
10981115 h_patch_info.num_faces ,
10991116 sizeof (uint16_t ),
@@ -1106,6 +1123,7 @@ void RXMesh::build_device_single_patch(const uint32_t patch_id,
11061123 h_patch_info.num_vertices ,
11071124 sizeof (uint16_t ),
11081125 cudaMemcpyHostToDevice));
1126+ m_timers.stop (" cudaMemcpy" );
11091127
11101128 // allocate and copy patch topology to the device
11111129 // we realloc the host h_patch_info EV and FE to ensure that both host and
@@ -1122,10 +1140,12 @@ void RXMesh::build_device_single_patch(const uint32_t patch_id,
11221140 h_patch_info.ev , p_edges_capacity * 2 * sizeof (LocalVertexT));
11231141
11241142 if (p_num_edges > 0 ) {
1143+ m_timers.start (" cudaMemcpy" );
11251144 CUDA_ERROR (cudaMemcpy (d_patch.ev ,
11261145 h_patch_info.ev ,
11271146 p_num_edges * 2 * sizeof (LocalVertexT),
11281147 cudaMemcpyHostToDevice));
1148+ m_timers.stop (" cudaMemcpy" );
11291149 }
11301150
11311151 m_timers.start (" cudaMalloc" );
@@ -1140,10 +1160,12 @@ void RXMesh::build_device_single_patch(const uint32_t patch_id,
11401160 h_patch_info.fe , p_faces_capacity * 3 * sizeof (LocalEdgeT));
11411161
11421162 if (p_num_faces > 0 ) {
1163+ m_timers.start (" cudaMemcpy" );
11431164 CUDA_ERROR (cudaMemcpy (d_patch.fe ,
11441165 h_patch_info.fe ,
11451166 p_num_faces * 3 * sizeof (LocalEdgeT),
11461167 cudaMemcpyHostToDevice));
1168+ m_timers.stop (" cudaMemcpy" );
11471169 }
11481170
11491171 m_timers.start (" cudaMalloc" );
@@ -1183,8 +1205,10 @@ void RXMesh::build_device_single_patch(const uint32_t patch_id,
11831205 }
11841206 }
11851207
1208+ m_timers.start (" bitmask.cudaMemcpy" );
11861209 CUDA_ERROR (
11871210 cudaMemcpy (d_mask, h_mask, num_bytes, cudaMemcpyHostToDevice));
1211+ m_timers.stop (" bitmask.cudaMemcpy" );
11881212
11891213 m_timers.stop (" bitmask" );
11901214 };
@@ -1229,10 +1253,12 @@ void RXMesh::build_device_single_patch(const uint32_t patch_id,
12291253
12301254 // Copy PatchStash
12311255 if (patch_id != INVALID32) {
1256+ m_timers.start (" cudaMemcpy" );
12321257 CUDA_ERROR (cudaMemcpy (d_patch.patch_stash .m_stash ,
12331258 h_patch_info.patch_stash .m_stash ,
12341259 PatchStash::stash_size * sizeof (uint32_t ),
12351260 cudaMemcpyHostToDevice));
1261+ m_timers.stop (" cudaMemcpy" );
12361262 }
12371263
12381264
@@ -1298,7 +1324,9 @@ void RXMesh::build_device_single_patch(const uint32_t patch_id,
12981324 }
12991325 }
13001326
1327+ m_timers.start (" hashtable.move" );
13011328 d_hashtable.move (h_hashtable);
1329+ m_timers.stop (" hashtable.move" );
13021330
13031331 m_timers.stop (" buildHT" );
13041332 };
@@ -1339,9 +1367,10 @@ void RXMesh::build_device_single_patch(const uint32_t patch_id,
13391367 h_patch_info.lp_f ,
13401368 d_patch.lp_f );
13411369
1342-
1370+ m_timers. start ( " cudaMemcpy " );
13431371 CUDA_ERROR (cudaMemcpy (
13441372 &d_patch_info, &d_patch, sizeof (PatchInfo), cudaMemcpyHostToDevice));
1373+ m_timers.stop (" cudaMemcpy" );
13451374}
13461375
13471376void RXMesh::allocate_extra_patches ()
@@ -1351,7 +1380,7 @@ void RXMesh::allocate_extra_patches()
13511380 const uint16_t p_edges_capacity = get_per_patch_max_edge_capacity ();
13521381 const uint16_t p_faces_capacity = get_per_patch_max_face_capacity ();
13531382
1354- // #pragma omp parallel for
1383+ #pragma omp parallel for
13551384 for (int p = get_num_patches (); p < static_cast <int >(get_max_num_patches ());
13561385 ++p) {
13571386
0 commit comments