@@ -71,11 +71,13 @@ void SessionElement::prepare(const HostConfig& host_config, std::vector<long> cu
7171 // If the host config allows smaller buffers, we need to adjust the latency and number of structs
7272 if (host_config.m_allow_smaller_buffers ) {
7373 HostConfig adjusted_config = host_config;
74+ HostConfig min_config = host_config;
7475
7576 // Find the greatest relative buffersize and count down from there
7677 float greatest_buffer_size = 0 ;
7778 size_t greatest_buffer_size_index = 0 ;
7879 bool greatest_buffer_size_is_input = true ;
80+ float buffer_size_ratio = 1 .f ;
7981
8082 for (size_t i = 0 ; i < m_inference_config.get_tensor_input_shape ().size (); ++i) {
8183 if (m_inference_config.get_preprocess_input_size ()[i] > 0 ) {
@@ -95,32 +97,47 @@ void SessionElement::prepare(const HostConfig& host_config, std::vector<long> cu
9597 }
9698 }
9799
100+ // Calculate the minimum buffer size based on the greatest buffer size
101+ if (greatest_buffer_size_is_input) {
102+ buffer_size_ratio = 1 .f / static_cast <float >(m_inference_config.get_preprocess_input_size ()[greatest_buffer_size_index]);
103+ } else {
104+ buffer_size_ratio = 1 .f / static_cast <float >(m_inference_config.get_postprocess_output_size ()[greatest_buffer_size_index]);
105+ }
106+ min_config.m_buffer_size = buffer_size_ratio * static_cast <float >(m_inference_config.get_preprocess_input_size ()[host_config.m_tensor_index ]);
107+
98108 while (--greatest_buffer_size > 0 ) {
99- float greatest_buffer_size_ratio ;
109+ float buffer_size_ratio ;
100110 if (greatest_buffer_size_is_input) {
101- greatest_buffer_size_ratio = greatest_buffer_size / m_inference_config.get_preprocess_input_size ()[greatest_buffer_size_index];
111+ buffer_size_ratio = greatest_buffer_size / static_cast < float >( m_inference_config.get_preprocess_input_size ()[greatest_buffer_size_index]) ;
102112 } else {
103- greatest_buffer_size_ratio = greatest_buffer_size / m_inference_config.get_postprocess_output_size ()[greatest_buffer_size_index];
113+ buffer_size_ratio = greatest_buffer_size / static_cast < float >( m_inference_config.get_postprocess_output_size ()[greatest_buffer_size_index]) ;
104114 }
105- adjusted_config.m_buffer_size = greatest_buffer_size_ratio * m_inference_config.get_preprocess_input_size ()[host_config.m_tensor_index ];
115+ adjusted_config.m_buffer_size = buffer_size_ratio * static_cast <float >(m_inference_config.get_preprocess_input_size ()[host_config.m_tensor_index ]);
116+
106117 std::vector<float > adjusted_latency;
107118 for (size_t i = 0 ; i < m_inference_config.get_tensor_output_shape ().size (); ++i) {
108119 if (m_inference_config.get_postprocess_output_size ()[i] > 0 ) {
109- float adjusted_buffer_size = adjusted_config.get_relative_buffer_size (m_inference_config, i, false );
110- float adjusted_sample_rate = adjusted_config.get_relative_sample_rate (m_inference_config, i, false );
111120 float max_buffer_size = host_config.get_relative_buffer_size (m_inference_config, i, false );
112- float max_sample_rate = host_config.get_relative_sample_rate (m_inference_config, i, false );
113-
114- int buffer_adaptation = calculate_buffer_adaptation (adjusted_buffer_size, m_inference_config.get_postprocess_output_size ()[i]);
115- int buffer_adaptation_full_buffer = std::min (static_cast <int >(std::ceil (max_buffer_size)), (int ) m_inference_config.get_postprocess_output_size ()[i]);
116- buffer_adaptation = std::max (buffer_adaptation_full_buffer, buffer_adaptation);
117-
118- float wait_time = calculate_wait_time (adjusted_buffer_size, adjusted_sample_rate);
119- int inference_caused_latency_full_buffer = calculate_inference_caused_latency (max_num_inferences (adjusted_config), max_buffer_size, max_sample_rate, wait_time);
120- // Inference caused latency of only full buffers and one buffer of adjusted sample rate
121- inference_caused_latency_full_buffer = std::max (0 , inference_caused_latency_full_buffer - (static_cast <int >(std::ceil (max_buffer_size - adjusted_buffer_size))));
122- int inference_caused_latency = calculate_inference_caused_latency (max_num_inferences (adjusted_config), adjusted_buffer_size, adjusted_sample_rate, wait_time);
123- inference_caused_latency = std::max (inference_caused_latency, inference_caused_latency_full_buffer);
121+ float adjusted_buffer_size = adjusted_config.get_relative_buffer_size (m_inference_config, i, false );
122+ float min_buffer_size = min_config.get_relative_buffer_size (m_inference_config, i, false );
123+ float sample_rate = adjusted_config.get_relative_sample_rate (m_inference_config, i, false );
124+
125+ // When allowing smaller buffer sizes, the buffer adaptation is always the post-process output size minus one
126+ // Because we could have buffers of size one only and this is the maximum adaptation possible
127+ int buffer_adaptation = std::max (static_cast <int >(m_inference_config.get_postprocess_output_size ()[i]) - 1 , 0 );
128+
129+ float max_wait_time = calculate_wait_time (max_buffer_size, sample_rate);
130+ float adjusted_wait_time = calculate_wait_time (adjusted_buffer_size, sample_rate);
131+ float min_wait_time = calculate_wait_time (min_buffer_size, sample_rate);
132+
133+ float max_possible_inferences = std::max (max_num_inferences (adjusted_config), max_num_inferences (host_config));
134+
135+ int inference_caused_latency_max_buffer = calculate_inference_caused_latency (max_possible_inferences, max_buffer_size, sample_rate, max_wait_time, m_inference_config.get_postprocess_output_size ()[i]);
136+ int inference_caused_latency_min_buffer = calculate_inference_caused_latency (1 , min_buffer_size, sample_rate, min_wait_time, m_inference_config.get_postprocess_output_size ()[i]);
137+ int inference_caused_latency_adjusted_buffer = calculate_inference_caused_latency (max_num_inferences (adjusted_config), adjusted_buffer_size, sample_rate, adjusted_wait_time, m_inference_config.get_postprocess_output_size ()[i]);
138+
139+ int inference_caused_latency = std::max ({inference_caused_latency_max_buffer, inference_caused_latency_adjusted_buffer, inference_caused_latency_min_buffer});
140+
124141 adjusted_latency.push_back (inference_caused_latency + buffer_adaptation);
125142 }
126143 }
@@ -249,7 +266,7 @@ std::vector<float> SessionElement::calculate_latency(const HostConfig& host_conf
249266 // Calculate the different parts of the latency
250267 int buffer_adaptation = calculate_buffer_adaptation (host_output_size, m_inference_config.get_postprocess_output_size ()[i]);
251268 float wait_time = calculate_wait_time (host_output_size, sample_rate);
252- int inference_caused_latency = calculate_inference_caused_latency (max_possible_inferences, host_output_size, sample_rate, wait_time);
269+ int inference_caused_latency = calculate_inference_caused_latency (max_possible_inferences, host_output_size, sample_rate, wait_time, m_inference_config. get_postprocess_output_size ()[i] );
253270 // Add it all together
254271 result_float.push_back (buffer_adaptation + inference_caused_latency);
255272 }
@@ -291,12 +308,45 @@ int SessionElement::calculate_buffer_adaptation(float host_buffer_size, int post
291308 return res;
292309}
293310
294- int SessionElement::calculate_inference_caused_latency (float max_possible_inferences, float host_buffer_size, float host_sample_rate, float wait_time) const {
311+ int SessionElement::calculate_inference_caused_latency (float max_possible_inferences, float host_buffer_size, float host_sample_rate, float wait_time, size_t postprocess_output_size ) const {
295312 // Calculate the host buffer time in ms
296313 float host_buffer_time = host_buffer_size * 1000 .f / host_sample_rate;
297- float total_inference_time_after_wait = (max_possible_inferences * m_inference_config.m_max_inference_time ) - wait_time;
298- float num_buffers_for_max_inferences = std::ceil (total_inference_time_after_wait / host_buffer_time);
299- return std::ceil (num_buffers_for_max_inferences * host_buffer_size);
314+ float inference_time_left = 0 .f ;
315+ int host_buffer_size_int = static_cast <int >(std::floor (host_buffer_size));
316+ float host_buffer_time_int = host_buffer_size_int * 1000 .f / host_sample_rate;
317+ int inference_caused_latency = 0 ;
318+
319+ unsigned int max_possible_inferences_parallel = static_cast <unsigned int >(std::ceil ((max_possible_inferences) / static_cast <float >(m_inference_config.m_num_parallel_processors )));
320+ int already_inferred = 0 ;
321+ float wait_time_left = wait_time;
322+ for (unsigned int i = 0 ; i < max_possible_inferences_parallel; ++i) {
323+ inference_time_left += m_inference_config.m_max_inference_time ;
324+ while (inference_time_left >= host_buffer_time_int && host_buffer_size_int > 0 ) {
325+ inference_caused_latency += host_buffer_size_int;
326+ inference_time_left -= host_buffer_time_int;
327+ wait_time_left += host_buffer_time_int;
328+ }
329+ }
330+
331+ while (inference_time_left > 0 ) {
332+ if (wait_time_left >= m_inference_config.m_max_inference_time ) {
333+ inference_time_left -= m_inference_config.m_max_inference_time ;
334+ already_inferred += m_inference_config.m_num_parallel_processors ;
335+ wait_time_left -= m_inference_config.m_max_inference_time ;
336+ } else {
337+ inference_caused_latency += host_buffer_size_int;
338+ if (host_buffer_time_int > 0 ) {
339+ inference_time_left -= host_buffer_time_int;
340+ } else {
341+ inference_caused_latency += 1 ;
342+ break ;
343+ }
344+ }
345+ }
346+
347+ inference_caused_latency -= already_inferred * postprocess_output_size;
348+
349+ return inference_caused_latency;
300350}
301351
302352float SessionElement::calculate_wait_time (float host_buffer_size, float host_sample_rate) const {
0 commit comments