
Commit b7e0ca4

Liran Oz authored and committed
issue: 1050049 Move poll_os logic to the internal thread
By default, VMA checks the OS fd once per 100 CQ polls (VMA_RX_UDP_POLL_OS_RATIO) in order to receive UDP packets larger than the MTU. This commit moves that logic to the internal thread, which uses epoll_wait() to indicate that non-offloaded data is available. It also removes the VMA_RX_UDP_POLL_OS_RATIO parameter.

Signed-off-by: Liran Oz <lirano@mellanox.com>
1 parent d2c8f24 commit b7e0ca4
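
The sketch below is a minimal, standalone illustration (not VMA code) of the mechanism the commit message describes: the non-offloaded fd is registered with EPOLLIN | EPOLLPRI | EPOLLONESHOT, an internal thread sleeps in epoll_wait() and only flags that kernel data may be available, and the registration is re-armed with EPOLL_CTL_MOD once the reader has drained the fd. The flag name os_data_available and the overall program structure are illustrative assumptions; only the epoll flags are taken from the diff below.

// rearm_demo.cpp - minimal sketch of the one-shot epoll pattern (illustrative only).
// Error handling is omitted for brevity; build with: g++ -std=c++11 -pthread rearm_demo.cpp
#include <atomic>
#include <cstdio>
#include <thread>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/epoll.h>
#include <sys/socket.h>
#include <unistd.h>

static std::atomic<bool> os_data_available(false);   // plays the role of m_b_os_data_available

int main()
{
    // A loopback UDP socket stands in for the user's non-offloaded fd.
    int fd = socket(AF_INET, SOCK_DGRAM, 0);
    sockaddr_in addr = {};
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
    bind(fd, (sockaddr*)&addr, sizeof(addr));         // kernel picks a free port

    // Register the fd one-shot: it reports readability once, then stays muted until re-armed.
    int epfd = epoll_create1(0);
    epoll_event ev = {};
    ev.events = EPOLLIN | EPOLLPRI | EPOLLONESHOT;
    ev.data.fd = fd;
    epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev);

    // "Internal thread": sleeps in epoll_wait() and only flags that OS data may be available.
    std::thread internal([epfd]() {
        epoll_event events[8];
        if (epoll_wait(epfd, events, 8, 1000) > 0)
            os_data_available = true;
    });

    // Send one datagram to ourselves so the event actually fires.
    socklen_t len = sizeof(addr);
    getsockname(fd, (sockaddr*)&addr, &len);
    int tx = socket(AF_INET, SOCK_DGRAM, 0);
    sendto(tx, "hi", 2, 0, (sockaddr*)&addr, sizeof(addr));
    internal.join();

    if (os_data_available) {
        char buf[64];
        ssize_t n = recv(fd, buf, sizeof(buf), 0);    // the reader drains the kernel socket...
        printf("drained %zd byte(s) from the OS fd\n", n);
        os_data_available = false;
        epoll_ctl(epfd, EPOLL_CTL_MOD, fd, &ev);      // ...and re-arms the one-shot registration
    }
    close(tx);
    close(epfd);
    close(fd);
    return 0;
}

EPOLLONESHOT keeps the internal thread from being woken repeatedly for the same pending data; the socket layer re-arms the fd only after it has had a chance to look at it.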

File tree: 8 files changed, +85 -122 lines

README.txt
Lines changed: 4 additions & 16 deletions

@@ -1,4 +1,4 @@
-Update: 13 Sep 2017
+Update: 17 Sep 2017
 
 Introduction
 ============
@@ -132,7 +132,6 @@ Example:
  VMA DETAILS: Rx Byte Min Limit 65536 [VMA_RX_BYTES_MIN]
  VMA DETAILS: Rx Poll Loops 100000 [VMA_RX_POLL]
  VMA DETAILS: Rx Poll Init Loops 0 [VMA_RX_POLL_INIT]
- VMA DETAILS: Rx UDP Poll OS Ratio 100 [VMA_RX_UDP_POLL_OS_RATIO]
  VMA DETAILS: HW TS Conversion 3 [VMA_HW_TS_CONVERSION]
  VMA DETAILS: Rx SW CSUM 1 [VMA_RX_SW_CSUM]
  VMA DETAILS: Rx Poll Yield Disabled [VMA_RX_POLL_YIELD]
@@ -481,17 +480,6 @@ Once the first ADD_MEMBERSHIP is called the above VMA_RX_POLL takes effect.
 Value range is similar to the above VMA_RX_POLL
 Default value is 0
 
-VMA_RX_UDP_POLL_OS_RATIO
-The above param will define the ratio between VMA CQ poll and OS FD poll.
-This will result in a single poll of the not-offloaded sockets every
-VMA_RX_UDP_POLL_OS_RATIO offloaded socket (CQ) polls. No matter if the CQ poll
-was a hit or miss. No matter if the socket is blocking or non-blocking.
-When disabled, only offloaded sockets are polled.
-This parameter replaces the two old parameters: VMA_RX_POLL_OS_RATIO and
-VMA_RX_SKIP_OS
-Disable with 0
-Default value is 100
-
 VMA_HW_TS_CONVERSION
 The above param defines the time stamp conversion method.
 Experimental verbs is required for converting the time stamp from hardware time (Hz)
@@ -616,9 +604,9 @@ Disable with 0
 Default value is 10
 
 VMA_SELECT_SKIP_OS
-Similar to VMA_RX_SKIP_OS, but in select() or poll() this will force the VMA
-to check the non offloaded fd even though an offloaded socket has ready
-packets found while polling.
+This will force the VMA to check the non offloaded fd even though an offloaded
+socket has ready packets found while polling.
+This will affect select() or poll().
 Default value is 4
 
 VMA_PROGRESS_ENGINE_INTERVAL
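
For reference, the deleted VMA_RX_UDP_POLL_OS_RATIO text above describes a ratio scheme: one FIONREAD peek at the kernel (not-offloaded) fd every N offloaded CQ polls, hit or miss. The sketch below is illustrative only; rx_poll_with_os_ratio and cq_poll_once are made-up placeholder names, not VMA internals.

// ratio_demo.cpp - illustrative sketch of the retired ratio-based OS polling.
#include <cstdio>
#include <netinet/in.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <unistd.h>

static bool cq_poll_once() { return false; }   // stand-in: pretend the CQ never has data

// Returns 1 when the kernel fd has data, 0 on a CQ hit, -1 when the poll budget runs out.
static int rx_poll_with_os_ratio(int os_fd, int os_ratio, int budget)
{
    static int counter = 0;
    while (budget-- > 0) {
        if (os_ratio > 0 && ++counter >= os_ratio) {   // every os_ratio-th iteration...
            counter = 0;
            int pending = 0;
            if (ioctl(os_fd, FIONREAD, &pending) == 0 && pending > 0)
                return 1;                              // ...peek once at the kernel socket
        }
        if (cq_poll_once())
            return 0;                                  // offloaded data found on the CQ
    }
    return -1;
}

int main()
{
    int fd = socket(AF_INET, SOCK_DGRAM, 0);           // unbound socket: no pending data
    printf("poll result: %d\n", rx_poll_with_os_ratio(fd, 100 /* old default */, 1000));
    close(fd);
    return 0;
}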

src/vma/main.cpp
Lines changed: 0 additions & 5 deletions

@@ -494,11 +494,6 @@ void print_vma_global_settings()
 	VLOG_PARAM_NUMBER("Rx Byte Min Limit", safe_mce_sys().rx_ready_byte_min_limit, MCE_DEFAULT_RX_BYTE_MIN_LIMIT, SYS_VAR_RX_BYTE_MIN_LIMIT);
 	VLOG_PARAM_NUMBER("Rx Poll Loops", safe_mce_sys().rx_poll_num, MCE_DEFAULT_RX_NUM_POLLS, SYS_VAR_RX_NUM_POLLS);
 	VLOG_PARAM_NUMBER("Rx Poll Init Loops", safe_mce_sys().rx_poll_num_init, MCE_DEFAULT_RX_NUM_POLLS_INIT, SYS_VAR_RX_NUM_POLLS_INIT);
-	if (safe_mce_sys().rx_udp_poll_os_ratio) {
-		VLOG_PARAM_NUMBER("Rx UDP Poll OS Ratio", safe_mce_sys().rx_udp_poll_os_ratio, MCE_DEFAULT_RX_UDP_POLL_OS_RATIO, SYS_VAR_RX_UDP_POLL_OS_RATIO);
-	} else {
-		VLOG_PARAM_STRING("Rx UDP Poll OS Ratio", safe_mce_sys().rx_udp_poll_os_ratio, MCE_DEFAULT_RX_UDP_POLL_OS_RATIO, SYS_VAR_RX_UDP_POLL_OS_RATIO, "Disabled");
-	}
 
 	VLOG_PARAM_NUMBER("HW TS Conversion", safe_mce_sys().hw_ts_conversion_mode, MCE_DEFAULT_HW_TS_CONVERSION_MODE, SYS_VAR_HW_TS_CONVERSION_MODE);
 	VLOG_PARAM_NUMBER("Rx SW CSUM", safe_mce_sys().rx_sw_csum, MCE_DEFUALT_RX_SW_CSUM, SYS_VAR_RX_SW_CSUM);

src/vma/sock/fd_collection.h
Lines changed: 6 additions & 0 deletions

@@ -221,10 +221,16 @@ inline cls* fd_collection::get(int fd, cls **map_type)
 
 inline bool fd_collection::set_immediate_os_sample(int fd)
 {
+	socket_fd_api* sock;
 	epfd_info* epfd_fd;
 
 	auto_unlocker locker(*this);
 
+	if ((sock = get_sockfd(fd))){
+		sock->set_immediate_os_sample();
+		return true;
+	}
+
 	if ((epfd_fd = get_epfd(fd))){
 		epfd_fd->set_os_data_available();
 		return true;

src/vma/sock/sockinfo_udp.cpp
Lines changed: 70 additions & 66 deletions

@@ -118,23 +118,24 @@ inline void sockinfo_udp::reuse_buffer(mem_buf_desc_t *buff)
 	}
 }
 
-inline int sockinfo_udp::poll_os()
+int sockinfo_udp::poll_and_arm_os()
 {
-	int ret;
 	uint64_t pending_data = 0;
 
-	m_rx_udp_poll_os_ratio_counter = 0;
-	ret = orig_os_api.ioctl(m_fd, FIONREAD, &pending_data);
-	if (unlikely(ret == -1)) {
+	int ret = orig_os_api.ioctl(m_fd, FIONREAD, &pending_data);
+	if (pending_data > 0) {
+		return pending_data;
+	}
+
+	// No data is available
+	unset_immediate_os_sample();
+
+	if (ret < 0) {
 		m_p_socket_stats->counters.n_rx_os_errors++;
 		si_udp_logdbg("orig_os_api.ioctl returned with error in polling loop (errno=%d %m)", errno);
-		return -1;
-	}
-	if (pending_data > 0) {
-		m_p_socket_stats->counters.n_rx_poll_os_hit++;
-		return 1;
 	}
-	return 0;
+
+	return ret;
 }
 
 inline int sockinfo_udp::rx_wait(bool blocking)
@@ -145,31 +146,26 @@ inline int sockinfo_udp::rx_wait(bool blocking)
 	epoll_event rx_epfd_events[SI_RX_EPFD_EVENT_MAX];
 	uint64_t poll_sn;
 
-        m_loops_timer.start();
+	m_loops_timer.start();
 
 	while (loops_to_go) {
 
 		// Multi-thread polling support - let other threads have a go on this CPU
-		if ((m_n_sysvar_rx_poll_yield_loops > 0) && ((loops % m_n_sysvar_rx_poll_yield_loops) == (m_n_sysvar_rx_poll_yield_loops - 1))) {
+		if ((m_n_sysvar_rx_poll_yield_loops > 0) && ((loops++ % m_n_sysvar_rx_poll_yield_loops) == (m_n_sysvar_rx_poll_yield_loops - 1))) {
			sched_yield();
 		}
 
-		// Poll socket for OS ready packets... (at a ratio of the offloaded sockets as defined in m_n_sysvar_rx_udp_poll_os_ratio)
-		if ((m_n_sysvar_rx_udp_poll_os_ratio > 0) && (m_rx_udp_poll_os_ratio_counter >= m_n_sysvar_rx_udp_poll_os_ratio)) {
-			ret = poll_os();
-			if ((ret == -1) || (ret == 1)) {
-				return ret;
-			}
+		if (m_b_os_data_available) {
+			// OS data might be available
+			return 1;
 		}
 
 		// Poll cq for offloaded ready packets ...
-		m_rx_udp_poll_os_ratio_counter++;
 		if (is_readable(&poll_sn)) {
 			m_p_socket_stats->counters.n_rx_poll_hit++;
 			return 0;
 		}
 
-		loops++;
 		if (!blocking || m_n_sysvar_rx_poll_num != -1) {
 			loops_to_go--;
 		}
@@ -287,7 +283,8 @@ inline int sockinfo_udp::rx_wait(bool blocking)
 
 		// Check if OS fd is ready for reading
 		if (fd == m_fd) {
-			m_rx_udp_poll_os_ratio_counter = 0;
+			// OS data might be available
+			set_immediate_os_sample();
 			return 1;
 		}
 
@@ -377,7 +374,6 @@ sockinfo_udp::sockinfo_udp(int fd) throw (vma_exception) :
 	,m_b_mc_tx_loop(safe_mce_sys().tx_mc_loopback_default) // default value is 'true'. User can change this with config parameter SYS_VAR_TX_MC_LOOPBACK
 	,m_n_mc_ttl(DEFAULT_MC_TTL)
 	,m_loops_to_go(safe_mce_sys().rx_poll_num_init) // Start up with a init polling loops value
-	,m_rx_udp_poll_os_ratio_counter(0)
 	,m_sock_offload(true)
 	,m_mc_num_grp_with_src_filter(0)
 	,m_port_map_lock("sockinfo_udp::m_ports_map_lock")
@@ -388,14 +384,14 @@ sockinfo_udp::sockinfo_udp(int fd) throw (vma_exception) :
 	,m_b_rcvtstampns(false)
 	,m_n_tsing_flags(0)
 	,m_n_sysvar_rx_poll_yield_loops(safe_mce_sys().rx_poll_yield_loops)
-	,m_n_sysvar_rx_udp_poll_os_ratio(safe_mce_sys().rx_udp_poll_os_ratio)
 	,m_n_sysvar_rx_ready_byte_min_limit(safe_mce_sys().rx_ready_byte_min_limit)
 	,m_n_sysvar_rx_cq_drain_rate_nsec(safe_mce_sys().rx_cq_drain_rate_nsec)
 	,m_n_sysvar_rx_delta_tsc_between_cq_polls(safe_mce_sys().rx_delta_tsc_between_cq_polls)
 	,m_reuseaddr(false)
 	,m_sockopt_mapped(false)
 	,m_is_connected(false)
 	,m_multicast(false)
+	,m_b_os_data_available(false)
 {
 	si_udp_logfunc("");
 
@@ -417,17 +413,17 @@ sockinfo_udp::sockinfo_udp(int fd) throw (vma_exception) :
 	rx_ready_byte_count_limit_update(n_so_rcvbuf_bytes);
 
 	epoll_event ev = {0, {0}};
-
 	ev.events = EPOLLIN;
-
-	// Add the user's orig fd to the rx epfd handle
-	ev.data.fd = m_fd;
+	ev.data.fd = m_fd; // Add the user's orig fd to the rx epfd handle
 
 	BULLSEYE_EXCLUDE_BLOCK_START
 	if (unlikely(orig_os_api.epoll_ctl(m_rx_epfd, EPOLL_CTL_ADD, ev.data.fd, &ev)))
 		si_udp_logpanic("failed to add user's fd to internal epfd errno=%d (%m)", errno);
 	BULLSEYE_EXCLUDE_BLOCK_END
 
+	// Register this socket to read nonoffloaded data
+	g_p_event_handler_manager->update_epfd(m_fd, EPOLL_CTL_ADD, EPOLLIN | EPOLLPRI | EPOLLONESHOT);
+
 	si_udp_logfunc("done");
 }
 
@@ -1415,30 +1411,25 @@ ssize_t sockinfo_udp::rx(const rx_call_t call_type, iovec* p_iov,ssize_t sz_iov,
 
 	return_reuse_buffers_postponed();
 
-	// Drop lock to not starve other threads
-	m_lock_rcv.unlock();
-
-	// Poll socket for OS ready packets... (at a ratio of the offloaded sockets as defined in m_n_sysvar_rx_udp_poll_os_ratio)
-	if ((m_n_sysvar_rx_udp_poll_os_ratio > 0) && (m_rx_udp_poll_os_ratio_counter >= m_n_sysvar_rx_udp_poll_os_ratio)) {
-		ret = poll_os();
-		if (ret == -1) {
-			/* coverity[double_lock] TODO: RM#1049980 */
-			m_lock_rcv.lock();
-			goto out;
-		}
-		if (ret == 1) {
-			/* coverity[double_lock] TODO: RM#1049980 */
-			m_lock_rcv.lock();
+	// Check for nonoffloaded data if m_b_os_data_available is true
+	if (m_b_os_data_available) {
+		ret = poll_and_arm_os();
+		if (ret > 0) {
 			goto os;
 		}
+		if (unlikely(ret < 0)) {
+			goto out;
+		}
 	}
 
+	// Drop lock to not starve other threads
+	m_lock_rcv.unlock();
+
 	// First check if we have a packet in the ready list
 	if ((m_n_rx_pkt_ready_list_count > 0 && m_n_sysvar_rx_cq_drain_rate_nsec == MCE_RX_CQ_DRAIN_RATE_DISABLED)
 		|| is_readable(&poll_sn)) {
 		/* coverity[double_lock] TODO: RM#1049980 */
 		m_lock_rcv.lock();
-		m_rx_udp_poll_os_ratio_counter++;
 		if (m_n_rx_pkt_ready_list_count > 0) {
 			// Found a ready packet in the list
 			if (__msg) handle_cmsg(__msg);
@@ -1464,24 +1455,33 @@ ssize_t sockinfo_udp::rx(const rx_call_t call_type, iovec* p_iov,ssize_t sz_iov,
 			if (__msg) handle_cmsg(__msg);
 			ret = dequeue_packet(p_iov, sz_iov, (sockaddr_in *)__from, __fromlen, in_flags, &out_flags);
 			goto out;
-		} else {
-			m_lock_rcv.unlock();
-			goto wait;
 		}
+		m_lock_rcv.unlock();
+		goto wait;
 	}
-	else if (unlikely(rx_wait_ret < 0)) {
+
+	if (unlikely(rx_wait_ret < 0)) {
 		// Got < 0, means an error occurred
 		ret = rx_wait_ret;
 		goto out;
-	} // else - packet in OS
+	}
+
+	// Else, check for nonoffloaded data - rx_wait() returned 1.
+	ret = poll_and_arm_os();
+	if (ret == 0) {
+		m_lock_rcv.unlock();
+		goto wait;
+	}
+	if (unlikely(ret < 0)) {
+		goto out;
+	}
 
 	/*
 	 * If we got here, either the socket is not offloaded or rx_wait() returned 1.
 	 */
 os:
 	if (in_flags & MSG_VMA_ZCOPY_FORCE) {
 		// Enable the next non-blocked read to check the OS
-		m_rx_udp_poll_os_ratio_counter = m_n_sysvar_rx_udp_poll_os_ratio;
 		errno = EIO;
 		ret = -1;
 		goto out;
@@ -1495,10 +1495,9 @@ ssize_t sockinfo_udp::rx(const rx_call_t call_type, iovec* p_iov,ssize_t sz_iov,
 	ret = socket_fd_api::rx_os(call_type, p_iov, sz_iov, in_flags, __from, __fromlen, __msg);
 	*p_flags = in_flags;
 	save_stats_rx_os(ret);
-	if (ret > 0) {
-		// This will cause the next non-blocked read to check the OS again.
-		// We do this only after a successful read.
-		m_rx_udp_poll_os_ratio_counter = m_n_sysvar_rx_udp_poll_os_ratio;
+	if (ret <= 0) {
+		// Do not poll the os fd on the next rx() call.
+		unset_immediate_os_sample();
 	}
 
 out:
@@ -1633,16 +1632,16 @@ void sockinfo_udp::handle_cmsg(struct msghdr * msg)
 	cm_state.mhdr->msg_controllen = cm_state.cmsg_bytes_consumed;
 }
 
-// This function is relevant only for non-blocking socket
 void sockinfo_udp::set_immediate_os_sample()
 {
-	m_rx_udp_poll_os_ratio_counter = m_n_sysvar_rx_udp_poll_os_ratio;
+	m_b_os_data_available = true;
 }
 
-// This function is relevant only for non-blocking socket
 void sockinfo_udp::unset_immediate_os_sample()
 {
-	m_rx_udp_poll_os_ratio_counter = 0;
+	// Reassign EPOLLIN event
+	m_b_os_data_available = false;
+	g_p_event_handler_manager->update_epfd(m_fd, EPOLL_CTL_MOD, EPOLLIN | EPOLLPRI | EPOLLONESHOT);
 }
 
 bool sockinfo_udp::is_readable(uint64_t *p_poll_sn, fd_array_t* p_fd_ready_array)
@@ -1952,7 +1951,6 @@ int sockinfo_udp::rx_verify_available_data()
 		if (ret >= 0) {
 			// This will cause the next non-blocked read to check the OS again.
 			// We do this only after a successful read.
-			m_rx_udp_poll_os_ratio_counter = m_n_sysvar_rx_udp_poll_os_ratio;
 			ret = pending_data;
 		}
 	}
@@ -2297,12 +2295,12 @@ void sockinfo_udp::rx_add_ring_cb(flow_tuple_with_local_if &flow_key, ring* p_ri
 	si_udp_logdbg("");
 	sockinfo::rx_add_ring_cb(flow_key, p_ring, is_migration);
 
-	//Now that we got at least 1 CQ attached enable the skip os mechanism.
-	m_rx_udp_poll_os_ratio_counter = m_n_sysvar_rx_udp_poll_os_ratio;
+	// Now that we got at least 1 CQ attached enable poll os for available data.
+	set_immediate_os_sample();
 
 	// Now that we got at least 1 CQ attached start polling the CQs
 	if (m_b_blocking) {
-                m_loops_to_go = m_n_sysvar_rx_poll_num;
+		m_loops_to_go = m_n_sysvar_rx_poll_num;
 	}
 	else {
 		m_loops_to_go = 1; // Force single CQ poll in case of non-blocking socket
@@ -2742,24 +2740,30 @@ size_t sockinfo_udp::handle_msg_trunc(size_t total_rx, size_t payload_size, int
 	return total_rx;
 }
 
-mem_buf_desc_t* sockinfo_udp::get_front_m_rx_pkt_ready_list(){
+mem_buf_desc_t* sockinfo_udp::get_front_m_rx_pkt_ready_list()
+{
 	return m_rx_pkt_ready_list.front();
 }
 
-size_t sockinfo_udp::get_size_m_rx_pkt_ready_list(){
+size_t sockinfo_udp::get_size_m_rx_pkt_ready_list()
+{
 	return m_rx_pkt_ready_list.size();
 }
 
-void sockinfo_udp::pop_front_m_rx_pkt_ready_list(){
+void sockinfo_udp::pop_front_m_rx_pkt_ready_list()
+{
 	m_rx_pkt_ready_list.pop_front();
 }
 
-void sockinfo_udp::push_back_m_rx_pkt_ready_list(mem_buf_desc_t* buff){
+void sockinfo_udp::push_back_m_rx_pkt_ready_list(mem_buf_desc_t* buff)
+{
 	m_rx_pkt_ready_list.push_back(buff);
 }
 
-bool sockinfo_udp::prepare_to_close(bool process_shutdown) {
+bool sockinfo_udp::prepare_to_close(bool process_shutdown)
+{
 	m_lock_rcv.lock();
+	g_p_event_handler_manager->update_epfd(m_fd, EPOLL_CTL_DEL, EPOLLIN | EPOLLPRI | EPOLLONESHOT);
 	do_wakeup();
 	m_lock_rcv.unlock();
 	NOT_IN_USE(process_shutdown);
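
Pulling the sockinfo_udp.cpp hunks above together, the receive path now decides in this order: consult the kernel fd only when the internal thread has flagged it (m_b_os_data_available), clear the flag and re-arm the one-shot registration when FIONREAD finds nothing, and otherwise poll only the offloaded path. The standalone sketch below mirrors that ordering under stated assumptions; rx_once, os_data_available and cq_has_ready_packet are illustrative stand-ins, not VMA APIs.

// rx_flow_demo.cpp - simplified sketch of the new receive ordering (illustrative only).
#include <atomic>
#include <cstdio>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <unistd.h>

static std::atomic<bool> os_data_available(true);     // pretend the internal thread already fired
static bool cq_has_ready_packet() { return false; }   // stub for is_readable()/ready-list check

// >0: bytes read from the kernel fd, 0: offloaded packet ready, -1: nothing ready.
static ssize_t rx_once(int os_fd, void* buf, size_t len)
{
    if (os_data_available) {
        int pending = 0;
        if (ioctl(os_fd, FIONREAD, &pending) == 0 && pending > 0)
            return recv(os_fd, buf, len, MSG_DONTWAIT);   // drain the kernel socket
        os_data_available = false;                        // empty: clear the flag
        // (in VMA the one-shot epoll registration is re-armed here with EPOLL_CTL_MOD)
    }
    return cq_has_ready_packet() ? 0 : -1;                // otherwise only the offloaded path
}

int main()
{
    // Loopback UDP socket that sends one datagram to itself, so the OS branch triggers.
    int fd = socket(AF_INET, SOCK_DGRAM, 0);
    sockaddr_in addr = {};
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
    bind(fd, (sockaddr*)&addr, sizeof(addr));
    socklen_t len = sizeof(addr);
    getsockname(fd, (sockaddr*)&addr, &len);
    sendto(fd, "ping", 4, 0, (sockaddr*)&addr, sizeof(addr));

    char buf[64];
    printf("rx_once returned %zd\n", rx_once(fd, buf, sizeof(buf)));  // expect 4
    close(fd);
    return 0;
}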
