Skip to content

Commit a1d5dc9

Browse files
infra: transform loopback netdev from tun to tap
In order to forward non-IP frames to the kernel, a full Ethernet frame is required. Remove the direct injection of packets to ip(6)_input nodes, and rely on the common nodes to direct the packets to the proper destination. loopback_output now behaves similarly to eth_output, prepending an Ethernet header. Following kernel behavior, do not add an IPv6 link-local address to the vrf iface. Signed-off-by: Christophe Fontaine <[email protected]>
1 parent 3dd7d94 commit a1d5dc9

File tree

10 files changed

+416
-389
lines changed

10 files changed

+416
-389
lines changed

docs/graph.svg

Lines changed: 325 additions & 331 deletions
Loading

modules/infra/control/gr_loopback.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,4 +9,3 @@
99

1010
void loopback_tx(struct rte_mbuf *m);
1111
control_input_t loopback_get_control_id(void);
12-
void loopback_input_add_type(rte_be16_t eth_type, const char *next_node);

modules/infra/control/loopback.c

Lines changed: 59 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
// SPDX-License-Identifier: BSD-3-Clause
22
// Copyright (c) 2024 Christophe Fontaine
33

4+
#include <gr_config.h>
45
#include <gr_control_input.h>
56
#include <gr_control_output.h>
67
#include <gr_eth.h>
@@ -14,6 +15,7 @@
1415
#include <event2/event.h>
1516
#include <rte_errno.h>
1617
#include <rte_malloc.h>
18+
#include <rte_net.h>
1719

1820
#include <fcntl.h>
1921
#include <linux/if_tun.h>
@@ -32,6 +34,7 @@ static struct event_base *ev_base;
3234
struct iface_info_loopback {
3335
int fd;
3436
struct event *ev;
37+
struct rte_ether_addr mac;
3538
};
3639

3740
static void finalize_fd(struct event *ev, void * /*priv*/) {
@@ -40,14 +43,26 @@ static void finalize_fd(struct event *ev, void * /*priv*/) {
4043
close(fd);
4144
}
4245

46+
static int loopback_mac_get(const struct iface *iface, struct rte_ether_addr *mac) {
47+
struct iface_info_loopback *lo = (struct iface_info_loopback *)iface->info;
48+
*mac = lo->mac;
49+
return 0;
50+
}
51+
4352
void loopback_tx(struct rte_mbuf *m) {
4453
struct mbuf_data *d = mbuf_data(m);
4554
struct iface_info_loopback *lo;
46-
struct iovec iov[2];
47-
struct tun_pi pi;
48-
char *data;
55+
struct rte_ether_hdr *eth;
56+
struct iface_stats *stats;
57+
char *data = NULL;
4958

5059
lo = (struct iface_info_loopback *)d->iface->info;
60+
61+
eth = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
62+
if (rte_is_broadcast_ether_addr(&eth->dst_addr) == 0)
63+
loopback_mac_get(d->iface, &eth->dst_addr);
64+
loopback_mac_get(d->iface, &eth->src_addr);
65+
5166
if (rte_pktmbuf_linearize(m) == 0) {
5267
data = rte_pktmbuf_mtod(m, char *);
5368
} else {
@@ -60,42 +75,41 @@ void loopback_tx(struct rte_mbuf *m) {
6075
// to the user provided buffer.
6176
rte_pktmbuf_read(m, 0, rte_pktmbuf_pkt_len(m), data);
6277
}
63-
pi.flags = 0;
64-
if ((data[0] & 0xf0) == 0x40)
65-
pi.proto = RTE_BE16(RTE_ETHER_TYPE_IPV4);
66-
else if ((data[0] & 0xf0) == 0x60)
67-
pi.proto = RTE_BE16(RTE_ETHER_TYPE_IPV6);
68-
else {
69-
LOG(ERR, "Bad proto: 0x%x - drop packet", data[0]);
70-
goto end;
71-
}
78+
7279
// Do not retry, even on EAGAIN || EWOULDBLOCK.
7380
// If the tun device queue is full, something really bad is
7481
// already happening on the management plane side.
75-
iov[0].iov_base = &pi;
76-
iov[0].iov_len = sizeof(pi);
77-
iov[1].iov_base = data;
78-
iov[1].iov_len = rte_pktmbuf_pkt_len(m);
79-
80-
if (writev(lo->fd, iov, ARRAY_DIM(iov)) < 0) {
82+
if (write(lo->fd, data, rte_pktmbuf_pkt_len(m)) < 0) {
8183
// The user messed up and removed gr-loopX
8284
// release resources on our side to try to recover
8385
if (errno == EBADFD) {
8486
iface_destroy(d->iface->id);
8587
}
86-
LOG(ERR, "write to tun device failed %s", strerror(errno));
88+
LOG(ERR, "write to tap device failed %s", strerror(errno));
8789
}
8890

91+
stats = iface_get_stats(rte_lcore_id(), d->iface->id);
92+
stats->tx_packets += 1;
93+
stats->tx_bytes += rte_pktmbuf_pkt_len(m);
94+
95+
if (gr_config.log_packets)
96+
trace_log_packet(m, "tx", d->iface->name);
97+
98+
// TODO: add a trace for that fake node
99+
if (gr_mbuf_is_traced(m))
100+
gr_mbuf_trace_finish(m);
89101
end:
90102
if (!rte_pktmbuf_is_contiguous(m))
91103
rte_free(data);
92104
rte_pktmbuf_free(m);
93105
}
94106

95107
static void iface_loopback_poll(evutil_socket_t, short reason, void *ev_iface) {
96-
struct eth_input_mbuf_data *e;
97108
struct iface_info_loopback *lo;
98109
struct iface *iface = ev_iface;
110+
struct eth_input_mbuf_data *e;
111+
struct rte_ether_hdr *eth;
112+
struct iface_stats *stats;
99113
struct rte_mbuf *mbuf;
100114
size_t read_len;
101115
size_t len;
@@ -105,7 +119,7 @@ static void iface_loopback_poll(evutil_socket_t, short reason, void *ev_iface) {
105119

106120
if (reason & EV_CLOSED) {
107121
// The user messed up and removed gr-loopX
108-
LOG(ERR, "tun device %s deleted", iface->name);
122+
LOG(ERR, "tap device %s deleted", iface->name);
109123
iface_destroy(iface->id);
110124
return;
111125
}
@@ -116,7 +130,7 @@ static void iface_loopback_poll(evutil_socket_t, short reason, void *ev_iface) {
116130
goto err;
117131
}
118132

119-
read_len = iface->mtu + sizeof(struct tun_pi);
133+
read_len = iface->mtu;
120134
if ((data = rte_pktmbuf_append(mbuf, read_len)) == NULL) {
121135
LOG(ERR, "rte_pktmbuf_alloc %s", rte_strerror(rte_errno));
122136
goto err;
@@ -130,20 +144,28 @@ static void iface_loopback_poll(evutil_socket_t, short reason, void *ev_iface) {
130144
}
131145

132146
rte_pktmbuf_trim(mbuf, read_len - len);
147+
eth = rte_pktmbuf_mtod(mbuf, struct rte_ether_hdr *);
148+
149+
if (rte_is_broadcast_ether_addr(&eth->dst_addr) == 0)
150+
loopback_mac_get(iface, &eth->dst_addr);
133151

134152
// packet sent from linux tun iface, no need to compute checksum;
135153
mbuf->ol_flags = RTE_MBUF_F_RX_IP_CKSUM_GOOD;
154+
mbuf->packet_type = rte_net_get_ptype(mbuf, NULL, RTE_PTYPE_ALL_MASK);
136155

137-
// We can't call rte_net_get_ptype directly as we do not have an ethernet frame.
138-
// An option would be to prepend/adjust every buffer, but let's set directly
139-
// the information we need instead.
140-
mbuf->packet_type = data[0] == 6 ? RTE_PTYPE_L3_IPV6 : RTE_PTYPE_L3_IPV4;
141-
142-
// required by ip(6)_input
143156
e = eth_input_mbuf_data(mbuf);
144157
e->iface = iface;
145158
e->domain = ETH_DOMAIN_LOOPBACK;
146159

160+
stats = iface_get_stats(rte_lcore_id(), iface->id);
161+
stats->rx_packets += 1;
162+
stats->rx_bytes += rte_pktmbuf_pkt_len(mbuf);
163+
164+
if (gr_config.log_packets)
165+
trace_log_packet(mbuf, "rx", iface->name);
166+
167+
// TODO: add trace for that fake node
168+
147169
post_to_stack(loopback_get_control_id(), mbuf);
148170
return;
149171

@@ -175,7 +197,7 @@ static int iface_loopback_init(struct iface *iface, const void * /* api_info */)
175197

176198
memset(&ifr, 0, sizeof(struct ifreq));
177199
memccpy(ifr.ifr_name, iface->name, 0, IFNAMSIZ);
178-
ifr.ifr_flags = IFF_TUN | IFF_POINTOPOINT | IFF_ONE_QUEUE;
200+
ifr.ifr_flags = IFF_TAP | IFF_ONE_QUEUE | IFF_NO_PI;
179201

180202
if ((ioctl_sock = socket(AF_INET, SOCK_DGRAM, 0)) < 0) {
181203
LOG(ERR, "socket(SOCK_DGRAM): %s", strerror(errno));
@@ -209,12 +231,18 @@ static int iface_loopback_init(struct iface *iface, const void * /* api_info */)
209231
goto err;
210232
}
211233

212-
ifr.ifr_flags |= IFF_UP;
234+
ifr.ifr_flags |= IFF_UP | IFF_NOARP;
213235
if (ioctl(ioctl_sock, SIOCSIFFLAGS, &ifr) < 0) {
214236
LOG(ERR, "ioctl(SIOCSIFFLAGS): %s", strerror(errno));
215237
goto err;
216238
}
217239

240+
if (ioctl(ioctl_sock, SIOCGIFHWADDR, &ifr) < 0) {
241+
LOG(ERR, "ioctl(SIOCGIFHWADDR) %s", strerror(errno));
242+
goto err;
243+
}
244+
memcpy(&lo->mac, ifr.ifr_hwaddr.sa_data, sizeof(lo->mac));
245+
218246
iface->flags = GR_IFACE_F_UP;
219247
iface->state = GR_IFACE_S_RUNNING;
220248
lo->ev = event_new(
@@ -267,6 +295,7 @@ static struct iface_type iface_type_loopback = {
267295
.init = iface_loopback_init,
268296
.fini = iface_loopback_fini,
269297
.to_api = iface_loopback_to_api,
298+
.get_eth_addr = loopback_mac_get,
270299
};
271300

272301
static struct gr_module loopback_module = {

modules/infra/datapath/eth_input.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,9 @@ eth_input_process(struct rte_graph *graph, struct rte_node *node, void **objs, u
9090
stats->rx_packets += 1;
9191
stats->rx_bytes += rte_pktmbuf_pkt_len(m);
9292

93+
if (unlikely(eth_in->domain == ETH_DOMAIN_LOOPBACK))
94+
goto next;
95+
9396
if (unlikely(rte_is_multicast_ether_addr(&eth->dst_addr))) {
9497
if (rte_is_broadcast_ether_addr(&eth->dst_addr))
9598
eth_in->domain = ETH_DOMAIN_BROADCAST;

modules/infra/datapath/loop_input.c

Lines changed: 4 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -18,43 +18,27 @@ control_input_t loopback_get_control_id(void) {
1818
}
1919

2020
enum {
21-
UNKNOWN_PROTO = 0,
21+
ETH = 0,
2222
EDGE_COUNT,
2323
};
2424

25-
static rte_edge_t l3_edges[1 << 16] = {UNKNOWN_PROTO};
26-
27-
void loopback_input_add_type(rte_be16_t eth_type, const char *next_node) {
28-
LOG(DEBUG, "loopback_input: type=0x%04x -> %s", rte_be_to_cpu_16(eth_type), next_node);
29-
if (l3_edges[eth_type] != UNKNOWN_PROTO)
30-
ABORT("next node already registered for ether type=0x%04x",
31-
rte_be_to_cpu_16(eth_type));
32-
l3_edges[eth_type] = gr_node_attach_parent("loopback_input", next_node);
33-
}
34-
3525
static uint16_t loopback_input_process(
3626
struct rte_graph *graph,
3727
struct rte_node *node,
3828
void **objs,
3929
uint16_t nb_objs
4030
) {
4131
struct rte_mbuf *mbuf;
42-
rte_be16_t eth_type;
43-
rte_edge_t edge;
4432

4533
for (uint16_t i = 0; i < nb_objs; i++) {
4634
mbuf = objs[i];
47-
4835
if (gr_mbuf_is_traced(mbuf)
4936
|| mbuf_data(mbuf)->iface->flags & GR_IFACE_F_PACKET_TRACE) {
5037
gr_mbuf_trace_add(mbuf, node, 0);
5138
}
52-
53-
eth_type = rte_pktmbuf_mtod(mbuf, struct tun_pi *)->proto;
54-
rte_pktmbuf_adj(mbuf, sizeof(struct tun_pi));
55-
edge = l3_edges[eth_type];
56-
rte_node_enqueue_x1(graph, node, edge, mbuf);
5739
}
40+
rte_node_enqueue(graph, node, ETH, objs, nb_objs);
41+
5842
return nb_objs;
5943
}
6044

@@ -63,7 +47,7 @@ static struct rte_node_register loopback_input_node = {
6347
.process = loopback_input_process,
6448
.nb_edges = EDGE_COUNT,
6549
.next_nodes = {
66-
[UNKNOWN_PROTO] = "loopback_unknown_proto",
50+
[ETH] = "eth_input",
6751
},
6852
};
6953

@@ -77,4 +61,3 @@ static struct gr_node_info info = {
7761
};
7862

7963
GR_NODE_REGISTER(info);
80-
GR_DROP_REGISTER(loopback_unknown_proto);

modules/infra/datapath/loop_output.c

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
// Copyright (c) 2024 Christophe Fontaine
33

44
#include <gr_control_output.h>
5+
#include <gr_eth.h>
56
#include <gr_graph.h>
67
#include <gr_infra.h>
78
#include <gr_ip4_datapath.h>
@@ -11,6 +12,7 @@
1112

1213
enum {
1314
CONTROL_OUTPUT,
15+
NO_HEADROOM,
1416
EDGE_COUNT,
1517
};
1618

@@ -20,14 +22,28 @@ static uint16_t loopback_output_process(
2022
void **objs,
2123
uint16_t nb_objs
2224
) {
25+
struct eth_output_mbuf_data *eth_data;
2326
struct control_output_mbuf_data *co;
27+
struct rte_ether_hdr *eth;
2428
struct rte_mbuf *mbuf;
29+
rte_edge_t edge;
2530

2631
for (uint16_t i = 0; i < nb_objs; i++) {
2732
mbuf = objs[i];
33+
edge = CONTROL_OUTPUT;
34+
eth_data = eth_output_mbuf_data(mbuf);
35+
eth = (struct rte_ether_hdr *)rte_pktmbuf_prepend(mbuf, sizeof(*eth));
36+
if (eth == NULL) {
37+
edge = NO_HEADROOM;
38+
goto next;
39+
}
40+
41+
eth->ether_type = eth_data->ether_type;
42+
2843
co = control_output_mbuf_data(mbuf);
2944
co->callback = loopback_tx;
30-
rte_node_enqueue_x1(graph, node, CONTROL_OUTPUT, mbuf);
45+
next:
46+
rte_node_enqueue_x1(graph, node, edge, mbuf);
3147
}
3248
return nb_objs;
3349
}
@@ -38,6 +54,7 @@ static struct rte_node_register loopback_output_node = {
3854
.nb_edges = EDGE_COUNT,
3955
.next_nodes = {
4056
[CONTROL_OUTPUT] = "control_output",
57+
[NO_HEADROOM] = "error_no_headroom",
4158
},
4259
};
4360

modules/ip/datapath/ip_input.c

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,6 @@ ip_input_process(struct rte_graph *graph, struct rte_node *node, void **objs, ui
158158

159159
static void ip_input_register(void) {
160160
gr_eth_input_add_type(RTE_BE16(RTE_ETHER_TYPE_IPV4), "ip_input");
161-
loopback_input_add_type(RTE_BE16(RTE_ETHER_TYPE_IPV4), "ip_input");
162161
}
163162

164163
static struct rte_node_register input_node = {
@@ -204,7 +203,6 @@ mock_func(uint16_t, drop_packets(struct rte_graph *, struct rte_node *, void **,
204203
mock_func(int, drop_format(char *, size_t, const void *, size_t));
205204
mock_func(int, trace_ip_format(char *, size_t, const struct rte_ipv4_hdr *, size_t));
206205
mock_func(void, gr_eth_input_add_type(rte_be16_t, const char *));
207-
mock_func(void, loopback_input_add_type(rte_be16_t, const char *));
208206

209207
struct fake_mbuf {
210208
struct rte_ipv4_hdr ipv4_hdr;

modules/ip6/control/address.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -323,6 +323,8 @@ static void ip6_iface_event_handler(uint32_t event, const void *obj) {
323323

324324
switch (event) {
325325
case GR_EVENT_IFACE_POST_ADD:
326+
if (iface->type == GR_IFACE_TYPE_LOOPBACK)
327+
break;
326328
if (iface_get_eth_addr(iface->id, &mac) == 0) {
327329
rte_ipv6_llocal_from_ethernet(&link_local, &mac);
328330
if (iface6_addr_add(iface, &link_local, 64) < 0)

modules/ip6/datapath/ip6_input.c

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,6 @@ ip6_input_process(struct rte_graph *graph, struct rte_node *node, void **objs, u
151151

152152
static void ip6_input_register(void) {
153153
gr_eth_input_add_type(RTE_BE16(RTE_ETHER_TYPE_IPV6), "ip6_input");
154-
loopback_input_add_type(RTE_BE16(RTE_ETHER_TYPE_IPV6), "ip6_input");
155154
}
156155

157156
static struct rte_node_register input_node = {
@@ -200,7 +199,6 @@ mock_func(uint16_t, drop_packets(struct rte_graph *, struct rte_node *, void **,
200199
mock_func(int, drop_format(char *, size_t, const void *, size_t));
201200
mock_func(int, trace_ip6_format(char *, size_t, const struct rte_ipv6_hdr *, size_t));
202201
mock_func(void, gr_eth_input_add_type(rte_be16_t, const char *));
203-
mock_func(void, loopback_input_add_type(rte_be16_t, const char *));
204202
mock_func(struct nexthop *, mcast6_get_member(uint16_t, const struct rte_ipv6_addr *));
205203

206204
struct fake_mbuf {

0 commit comments

Comments
 (0)