Skip to content

Commit c7379c0

Browse files
author
Liran Oz
committed
issue: 1117626 Move epoll poll_os logic to the internal thread
This commit is not based on udp poll os commit (pr Mellanox#445). Fix review comments Remove boolean var from io_mux_call::handle_os_countdown() * 6a0b53a issue: 1117626 Move epoll poll_os logic to the internal thread * b1de6f5 issue: 1117626 Move epoll poll_os logic to the internal thread * f0adf45 issue: 1117626 Move epoll poll_os logic to the internal thread Signed-off-by: Liran Oz <lirano@mellanox.com>
1 parent e96aa75 commit c7379c0

File tree

12 files changed

+185
-89
lines changed

12 files changed

+185
-89
lines changed

README.txt

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
Update: 03 Sep 2017
1+
Update: 13 Sep 2017
22

33
Introduction
44
============
@@ -607,18 +607,18 @@ Default value is 0
607607

608608
VMA_SELECT_POLL_OS_RATIO
609609
This will enable polling of the OS file descriptors while user thread calls
610-
select(), poll() or epoll_wait() and the VMA is busy in the offloaded sockets
611-
polling loop. This will result in a signle poll of the not-offloaded sockets
612-
every VMA_SELECT_POLL_RATIO offlaoded sockets (CQ) polls.
610+
select() or poll() and the VMA is busy in the offloaded sockets polling loop.
611+
This will result in a signle poll of the not-offloaded sockets every
612+
VMA_SELECT_POLL_RATIO offlaoded sockets (CQ) polls.
613613
When disabled, only offlaoded sockets are polled.
614614
(See VMA_SELECT_POLL for more info)
615615
Disable with 0
616616
Default value is 10
617617

618618
VMA_SELECT_SKIP_OS
619-
Similar to VMA_RX_SKIP_OS, but in select(), poll() or epoll_wait() this will
620-
force the VMA to check the non offloaded fd even though an offloaded socket
621-
has ready packets found while polling.
619+
Similar to VMA_RX_SKIP_OS, but in select() or poll() this will force the VMA
620+
to check the non offloaded fd even though an offloaded socket has ready
621+
packets found while polling.
622622
Default value is 4
623623

624624
VMA_PROGRESS_ENGINE_INTERVAL

src/vma/event/event_handler_manager.cpp

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
#include <rdma/rdma_cma.h>
3838
#include <vma/dev/net_device_table_mgr.h>
3939
#include "vma/dev/ring_allocation_logic.h"
40+
#include "vma/sock/fd_collection.h"
4041
#include "vma/sock/sock-redirect.h" // calling orig_os_api.epoll()
4142
#include "vma/util/verbs_extra.h"
4243

@@ -395,11 +396,11 @@ void event_handler_manager::stop_thread()
395396
m_epfd = -1;
396397
}
397398

398-
void event_handler_manager::update_epfd(int fd, int operation)
399+
void event_handler_manager::update_epfd(int fd, int operation, int events)
399400
{
400401
epoll_event ev = {0, {0}};
401402

402-
ev.events = EPOLLIN | EPOLLPRI;
403+
ev.events = events;
403404
ev.data.fd = fd;
404405
BULLSEYE_EXCLUDE_BLOCK_START
405406
if (orig_os_api.epoll_ctl(m_epfd, operation, fd, &ev) < 0) {
@@ -520,7 +521,7 @@ void event_handler_manager::priv_register_ibverbs_events(ibverbs_reg_info_t& inf
520521

521522
priv_prepare_ibverbs_async_event_queue(i);
522523

523-
update_epfd(info.fd, EPOLL_CTL_ADD);
524+
update_epfd(info.fd, EPOLL_CTL_ADD, EPOLLIN | EPOLLPRI);
524525
evh_logdbg("%d added to event_handler_map_t!", info.fd);
525526
}
526527
BULLSEYE_EXCLUDE_BLOCK_START
@@ -582,7 +583,7 @@ void event_handler_manager::priv_unregister_ibverbs_events(ibverbs_reg_info_t& i
582583

583584
i->second.ibverbs_ev.ev_map.erase(j);
584585
if (n == 1) {
585-
update_epfd(info.fd, EPOLL_CTL_DEL);
586+
update_epfd(info.fd, EPOLL_CTL_DEL, EPOLLIN | EPOLLPRI);
586587
m_event_handler_map.erase(i);
587588
evh_logdbg("%d erased from event_handler_map_t!", info.fd);
588589
}
@@ -607,7 +608,7 @@ void event_handler_manager::priv_register_rdma_cm_events(rdma_cm_reg_info_t& inf
607608
/* cppcheck-suppress uninitStructMember */
608609
m_event_handler_map[info.fd] = map_value;
609610

610-
update_epfd(info.fd, EPOLL_CTL_ADD);
611+
update_epfd(info.fd, EPOLL_CTL_ADD, EPOLLIN | EPOLLPRI);
611612
}
612613
else {
613614
BULLSEYE_EXCLUDE_BLOCK_START
@@ -651,7 +652,7 @@ void event_handler_manager::priv_unregister_rdma_cm_events(rdma_cm_reg_info_t& i
651652
iter_fd->second.rdma_cm_ev.map_rdma_cm_id.erase(iter_id);
652653
iter_fd->second.rdma_cm_ev.n_ref_count--;
653654
if (iter_fd->second.rdma_cm_ev.n_ref_count == 0) {
654-
update_epfd(info.fd, EPOLL_CTL_DEL);
655+
update_epfd(info.fd, EPOLL_CTL_DEL, EPOLLIN | EPOLLPRI);
655656
m_event_handler_map.erase(iter_fd);
656657
evh_logdbg("Removed channel <%d %p>", info.fd, info.id);
657658
}
@@ -679,7 +680,7 @@ void event_handler_manager::priv_register_command_events(command_reg_info_t& inf
679680
/* coverity[uninit_use_in_call] */
680681
/* cppcheck-suppress uninitStructMember */
681682
m_event_handler_map[info.fd] = map_value;
682-
update_epfd(info.fd, EPOLL_CTL_ADD);
683+
update_epfd(info.fd, EPOLL_CTL_ADD, EPOLLIN | EPOLLPRI);
683684
}
684685

685686
}
@@ -696,7 +697,7 @@ void event_handler_manager::priv_unregister_command_events(command_reg_info_t& i
696697
evh_logdbg(" This fd (%d) no longer COMMAND type fd", info.fd);
697698
}
698699
else {
699-
update_epfd(info.fd, EPOLL_CTL_DEL);
700+
update_epfd(info.fd, EPOLL_CTL_DEL, EPOLLIN | EPOLLPRI);
700701
}
701702
}
702703

@@ -954,7 +955,10 @@ void* event_handler_manager::thread_loop()
954955

955956
event_handler_map_t::iterator i = m_event_handler_map.find(fd);
956957
if (i == m_event_handler_map.end()) {
957-
evh_logdbg("No event handler (fd=%d)", fd);
958+
// No event handler - this is probably a poll_os event!
959+
if (!g_p_fd_collection->set_immediate_os_sample(fd)) {
960+
evh_logdbg("No event handler (fd=%d)", fd);
961+
}
958962
continue;
959963
}
960964

src/vma/event/event_handler_manager.h

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -127,15 +127,15 @@ struct reg_action_t {
127127

128128
typedef std::deque<struct reg_action_t> reg_action_q_t;
129129

130-
enum {
130+
enum ev_type {
131131
EV_IBVERBS,
132132
EV_RDMA_CM,
133133
EV_COMMAND,
134134
};
135135

136136

137137
struct event_data_t {
138-
int type;
138+
ev_type type;
139139
ibverbs_ev_t ibverbs_ev;
140140
rdma_cm_ev_t rdma_cm_ev;
141141
command_ev_t command_ev;
@@ -174,11 +174,13 @@ class event_handler_manager : public wakeup_pipe
174174
void* thread_loop();
175175
void stop_thread();
176176

177+
void update_epfd(int fd, int operation, int events);
178+
177179
private:
178180
pthread_t m_event_handler_tid;
179181
bool m_b_continue_running;
180182
int m_cq_epfd;
181-
int m_epfd;
183+
int m_epfd;
182184

183185
// pipe for the event registration handling
184186
reg_action_q_t m_reg_action_q;
@@ -209,7 +211,6 @@ class event_handler_manager : public wakeup_pipe
209211
void process_ibverbs_event(event_handler_map_t::iterator &i);
210212
void process_rdma_cm_event(event_handler_map_t::iterator &i);
211213
int start_thread();
212-
void update_epfd(int fd, int operation);
213214

214215
void event_channel_post_process_for_rdma_events(void* p_event);
215216
void* event_channel_pre_process_for_rdma_events(void* p_event_channel_handle, void** p_event);

src/vma/iomux/epfd_info.cpp

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,9 @@ int epfd_info::remove_fd_from_epoll_os(int fd)
5353
}
5454

5555
epfd_info::epfd_info(int epfd, int size) :
56-
lock_mutex_recursive("epfd_info"), m_epfd(epfd), m_size(size), m_ring_map_lock("epfd_ring_map_lock"), m_sysvar_thread_mode(safe_mce_sys().thread_mode)
56+
lock_mutex_recursive("epfd_info"), m_epfd(epfd), m_size(size), m_ring_map_lock("epfd_ring_map_lock"),
57+
m_lock_poll_os("epfd_lock_poll_os"), m_sysvar_thread_mode(safe_mce_sys().thread_mode),
58+
m_b_os_data_available(false)
5759
{
5860
__log_funcall("");
5961
int max_sys_fd = get_sys_max_fd_num();
@@ -82,6 +84,9 @@ epfd_info::epfd_info(int epfd, int size) :
8284

8385
vma_stats_instance_create_epoll_block(m_epfd, &(m_stats->stats));
8486

87+
// Register this socket to read nonoffloaded data
88+
g_p_event_handler_manager->update_epfd(m_epfd, EPOLL_CTL_ADD, EPOLLIN | EPOLLPRI | EPOLLONESHOT);
89+
8590
wakeup_set_epoll_fd(m_epfd);
8691
}
8792

@@ -120,6 +125,9 @@ epfd_info::~epfd_info()
120125
}
121126
BULLSEYE_EXCLUDE_BLOCK_END
122127
}
128+
129+
g_p_event_handler_manager->update_epfd(m_epfd, EPOLL_CTL_DEL, EPOLLIN | EPOLLPRI | EPOLLONESHOT);
130+
123131
unlock();
124132

125133
vma_stats_instance_remove_epoll_block(&m_stats->stats);
@@ -765,3 +773,26 @@ void epfd_info::statistics_print(vlog_levels_t log_level /* = VLOG_DEBUG */)
765773
}
766774
}
767775
}
776+
777+
void epfd_info::set_immediate_os_sample()
778+
{
779+
auto_unlocker locker(m_lock_poll_os);
780+
m_b_os_data_available = true;
781+
}
782+
783+
void epfd_info::unset_immediate_os_sample()
784+
{
785+
auto_unlocker locker(m_lock_poll_os);
786+
m_b_os_data_available = false;
787+
788+
// Reassign EPOLLIN event
789+
g_p_event_handler_manager->update_epfd(m_epfd, EPOLL_CTL_MOD, EPOLLIN | EPOLLPRI | EPOLLONESHOT);
790+
}
791+
792+
bool epfd_info::get_and_unset_os_data_available()
793+
{
794+
auto_unlocker locker(m_lock_poll_os);
795+
bool ret = m_b_os_data_available;
796+
m_b_os_data_available = false;
797+
return ret;
798+
}

src/vma/iomux/epfd_info.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,12 @@ class epfd_info : public lock_mutex_recursive, public cleanable_obj, public wake
102102

103103
void statistics_print(vlog_levels_t log_level = VLOG_DEBUG);
104104

105+
// Instructing the socket to immediately sample/un-sample the OS in receive flow
106+
void set_immediate_os_sample();
107+
void unset_immediate_os_sample();
108+
bool get_and_unset_os_data_available();
109+
inline bool get_os_data_available() {return m_b_os_data_available;}
110+
105111
static inline size_t epfd_info_node_offset(void) {return NODE_OFFSET(epfd_info, epfd_info_node);}
106112
list_node<epfd_info, epfd_info::epfd_info_node_offset> epfd_info_node;
107113

@@ -115,11 +121,13 @@ class epfd_info : public lock_mutex_recursive, public cleanable_obj, public wake
115121
fd_info_list_t m_fd_offloaded_list;
116122
ring_map_t m_ring_map;
117123
lock_mutex_recursive m_ring_map_lock;
124+
lock_spin m_lock_poll_os;
118125
const thread_mode_t m_sysvar_thread_mode;
119126
ready_cq_fd_q_t m_ready_cq_fd_q;
120127
epoll_stats_t m_local_stats;
121128
epoll_stats_t *m_stats;
122129
int m_log_invalid_events;
130+
bool m_b_os_data_available; // true when not offloaded data is available
123131

124132
int add_fd(int fd, epoll_event *event);
125133
int del_fd(int fd, bool passthrough = false);

src/vma/iomux/epoll_wait_call.cpp

Lines changed: 38 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -156,11 +156,6 @@ epoll_wait_call::~epoll_wait_call()
156156
{
157157
}
158158

159-
void epoll_wait_call::prepare_to_poll()
160-
{
161-
// Empty
162-
}
163-
164159
void epoll_wait_call::prepare_to_block()
165160
{
166161
// Empty
@@ -335,8 +330,9 @@ bool epoll_wait_call::check_all_offloaded_sockets(uint64_t *p_poll_sn)
335330
return m_n_all_ready_fds;
336331
}
337332

338-
bool epoll_wait_call::immidiate_return()
333+
bool epoll_wait_call::immidiate_return(int &poll_os_countdown)
339334
{
335+
NOT_IN_USE(poll_os_countdown);
340336
return false;
341337
}
342338

@@ -364,6 +360,42 @@ bool epoll_wait_call::handle_epoll_event(bool is_ready, uint32_t events, socket_
364360

365361
}
366362

363+
bool epoll_wait_call::handle_os_countdown(int &poll_os_countdown)
364+
{
365+
NOT_IN_USE(poll_os_countdown);
366+
367+
if (!m_epfd_info->get_os_data_available() || !m_epfd_info->get_and_unset_os_data_available()) {
368+
return false;
369+
}
370+
371+
/*
372+
* Poll OS when the internal thread found non offloaded data.
373+
*/
374+
bool cq_ready = wait_os(true);
375+
376+
m_epfd_info->unset_immediate_os_sample();
377+
378+
if (cq_ready) {
379+
// This will empty the cqepfd
380+
// (most likely in case of a wakeup and probably only under epoll_wait (Not select/poll))
381+
ring_wait_for_notification_and_process_element(&m_poll_sn, NULL);
382+
}
383+
/* Before we exit with ready OS fd's we'll check the CQs once more and exit
384+
* below after calling check_all_offloaded_sockets();
385+
* IMPORTANT : We cannot do an opposite with current code,
386+
* means we cannot poll cq and then poll os (for epoll) - because poll os
387+
* will delete ready offloaded fds.
388+
*/
389+
if (m_n_all_ready_fds) {
390+
m_p_stats->n_iomux_os_rx_ready += m_n_all_ready_fds; // TODO: fix it - we only know all counter, not read counter
391+
ring_poll_and_process_element(&m_poll_sn, NULL);
392+
check_all_offloaded_sockets(&m_poll_sn);
393+
return true;
394+
}
395+
396+
return false;
397+
}
398+
367399
int epoll_wait_call::ring_poll_and_process_element(uint64_t *p_poll_sn, void* pv_fd_ready_array/* = NULL*/)
368400
{
369401
return m_epfd_info->ring_poll_and_process_element(p_poll_sn, pv_fd_ready_array);

src/vma/iomux/epoll_wait_call.h

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -63,9 +63,6 @@ class epoll_wait_call : public io_mux_call
6363
/// @override
6464
virtual void set_offloaded_rfd_ready(int fd_index);
6565
virtual void set_offloaded_wfd_ready(int fd_index);
66-
67-
/// @override
68-
virtual void prepare_to_poll();
6966

7067
/// @override
7168
virtual void prepare_to_block();
@@ -95,7 +92,7 @@ class epoll_wait_call : public io_mux_call
9592
virtual void unlock();
9693

9794
/// @override
98-
virtual bool immidiate_return();
95+
virtual bool immidiate_return(int &poll_os_countdown);
9996

10097
/// @override
10198
virtual bool check_all_offloaded_sockets(uint64_t *p_poll_sn);
@@ -113,6 +110,8 @@ class epoll_wait_call : public io_mux_call
113110

114111
virtual int ring_wait_for_notification_and_process_element(uint64_t *p_poll_sn, void* pv_fd_ready_array = NULL);
115112

113+
virtual bool handle_os_countdown(int &poll_os_countdown);
114+
116115
private:
117116
bool _wait(int timeout);
118117

0 commit comments

Comments
 (0)