Skip to content

Commit a8f643d

Browse files
committed
Add profiling to identify first-inference NPU overhead root cause
1 parent e5720d6 commit a8f643d

File tree

4 files changed

+190
-108
lines changed

4 files changed

+190
-108
lines changed

src/plugins/intel_npu/src/backend/include/zero_pipeline.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ struct Pipeline {
7373
bool _sync_output_with_fences = true;
7474
uint32_t _extension_version;
7575
Logger _logger;
76+
size_t _infer_count = 0;
7677

7778
private:
7879
void enable_profiling();

src/plugins/intel_npu/src/backend/src/zero_infer_request.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@
44

55
#include "zero_infer_request.hpp"
66

7+
#include <chrono>
8+
#include <cstdio>
9+
710
#include "intel_npu/common/itt.hpp"
811
#include "intel_npu/config/options.hpp"
912
#include "intel_npu/prefix.hpp"
@@ -709,7 +712,15 @@ void ZeroInferRequest::prepare_inputs() {
709712

710713
if (!_pipelineIsCreated || _dynamicBatchValueChanged) {
711714
OV_ITT_TASK_NEXT(ZERO_INFER, "create_pipeline");
715+
const auto pipeline_start = std::chrono::steady_clock::now();
712716
create_pipeline(); // Reallocate pipeline if necessary
717+
const auto pipeline_end = std::chrono::steady_clock::now();
718+
const auto pipeline_duration_us =
719+
std::chrono::duration_cast<std::chrono::microseconds>(pipeline_end - pipeline_start).count();
720+
fprintf(stderr, "[ZeroInferRequest] create_pipeline() took %.3f ms (first_creation=%d, dynamic_batch_changed=%d)\n",
721+
pipeline_duration_us / 1000.0,
722+
!_pipelineIsCreated,
723+
_dynamicBatchValueChanged);
713724
_pipelineIsCreated = true;
714725
_dynamicBatchValueChanged = false; // Reset reallocation flag
715726
} else {

src/plugins/intel_npu/src/backend/src/zero_pipeline.cpp

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@
77
#include <ze_api.h>
88
#include <ze_graph_ext.h>
99

10+
#include <chrono>
11+
#include <cstdio>
12+
1013
#include "intel_npu/common/itt.hpp"
1114
#include "intel_npu/config/options.hpp"
1215
#include "intel_npu/utils/logger/logger.hpp"
@@ -110,6 +113,8 @@ Pipeline::Pipeline(const Config& config,
110113
}
111114
}
112115

116+
const auto t_cmdlist_start = std::chrono::steady_clock::now();
117+
113118
_command_lists.reserve(_number_of_command_lists);
114119
for (size_t i = 0; i < _number_of_command_lists; i++) {
115120
_command_lists.emplace_back(
@@ -124,6 +129,10 @@ Pipeline::Pipeline(const Config& config,
124129
}
125130
}
126131

132+
const auto t_cmdlist_end = std::chrono::steady_clock::now();
133+
const auto t_setargs_start = std::chrono::steady_clock::now();
134+
size_t total_set_argument_calls = 0;
135+
127136
for (size_t i = 0; i < _number_of_command_lists; i++) {
128137
_logger.debug("Pipeline - set args for command list number: %zu", i);
129138
size_t io_index = 0;
@@ -146,6 +155,7 @@ Pipeline::Pipeline(const Config& config,
146155
tensor->data(),
147156
get_strides(tensor->get_strides(), tensor->get_element_type().size()));
148157
}
158+
++total_set_argument_calls;
149159
++io_index;
150160
continue;
151161
}
@@ -162,6 +172,7 @@ Pipeline::Pipeline(const Config& config,
162172
get_strides(tensor->get_strides(), tensor->get_element_type().size()));
163173
}
164174

175+
++total_set_argument_calls;
165176
++io_index;
166177
}
167178

@@ -178,6 +189,7 @@ Pipeline::Pipeline(const Config& config,
178189
static_cast<unsigned char*>(tensor->data()) + (i * tensor->get_strides()[0]),
179190
get_strides(tensor->get_strides(), tensor->get_element_type().size()));
180191
}
192+
++total_set_argument_calls;
181193
++io_index;
182194
}
183195

@@ -217,6 +229,18 @@ Pipeline::Pipeline(const Config& config,
217229
_events.at(i)->AppendSignalEvent(*_command_lists.at(i));
218230
}
219231
}
232+
233+
const auto t_setargs_end = std::chrono::steady_clock::now();
234+
const auto cmdlist_us =
235+
std::chrono::duration_cast<std::chrono::microseconds>(t_cmdlist_end - t_cmdlist_start).count();
236+
const auto setargs_us =
237+
std::chrono::duration_cast<std::chrono::microseconds>(t_setargs_end - t_setargs_start).count();
238+
fprintf(stderr, "[Pipeline] Pipeline() breakdown: command_list_create=%.3f ms, "
239+
"set_argument_values=%.3f ms (%zu calls)\n",
240+
cmdlist_us / 1000.0,
241+
setargs_us / 1000.0,
242+
total_set_argument_calls);
243+
220244
_logger.debug("Pipeline - initialize completed");
221245
}
222246

@@ -249,6 +273,8 @@ void Pipeline::push() {
249273
_graph->set_last_submitted_id(_id);
250274
}
251275

276+
const auto t_push_start = std::chrono::steady_clock::now();
277+
252278
for (size_t i = 0; i < _command_lists.size(); ++i) {
253279
_command_lists.at(i)->close();
254280

@@ -260,13 +286,23 @@ void Pipeline::push() {
260286
}
261287
}
262288

289+
if (_infer_count < 2) {
290+
const auto t_push_end = std::chrono::steady_clock::now();
291+
const auto push_us =
292+
std::chrono::duration_cast<std::chrono::microseconds>(t_push_end - t_push_start).count();
293+
fprintf(stderr, "[Pipeline] push() #%zu: %.3f ms (close+execute)\n",
294+
_infer_count, push_us / 1000.0);
295+
}
296+
263297
_logger.debug("Pipeline - push() completed");
264298
};
265299

266300
void Pipeline::pull() {
267301
_logger.debug("Pipeline - pull() started");
268302
OV_ITT_TASK_CHAIN(ZERO_PIPELINE_IP_PULL, itt::domains::LevelZeroBackend, "Pipeline", "pull");
269303

304+
const auto t_pull_start = std::chrono::steady_clock::now();
305+
270306
for (size_t i = 0; i < _command_lists.size(); ++i) {
271307
if (_sync_output_with_fences) {
272308
_fences.at(i)->hostSynchronize();
@@ -279,6 +315,16 @@ void Pipeline::pull() {
279315
}
280316
}
281317

318+
if (_infer_count < 2) {
319+
const auto t_pull_end = std::chrono::steady_clock::now();
320+
const auto pull_us =
321+
std::chrono::duration_cast<std::chrono::microseconds>(t_pull_end - t_pull_start).count();
322+
fprintf(stderr, "[Pipeline] pull() #%zu: %.3f ms (hostSynchronize)\n",
323+
_infer_count, pull_us / 1000.0);
324+
}
325+
326+
++_infer_count;
327+
282328
_logger.debug("Pipeline - pull() completed");
283329
};
284330

0 commit comments

Comments
 (0)