Skip to content

Commit f529806

Browse files
committed
Add IPvlan L2 driver for Alibaba Cloud HPN nodes
Implement IPvlan-based networking for Alibaba Cloud HPN nodes, enabling non-hostNetwork pods to use RoCE RDMA across nodes. Instead of moving bond interfaces into the Pod namespace, create IPvlan L2 slaves with derived IPv6 addresses (host prefix + pod IPv4).

Key changes:
- Add ACK cloud provider with HPN detection (IMDS + InfiniBand fallback)
- Add IPVlanConfig to InterfaceConfig API for cloud provider signaling
- PrepareResourceClaim: compute IPvlan slave configs with routes/neighbors
- RunPodSandbox: create IPvlan slaves, assign IPv6, configure routes
- StopPodSandbox: skip netdev/RDMA detach for IPvlan devices (kernel cleanup)
- Use per-interface route metric (1024+linkIndex) to prevent RouteReplace from overwriting routes with the same destination across multiple bonds
- Serialize concurrent IPvlan creation with mutex
- Use runtime.LockOSThread for netns switching in setAddrGenMode
- Gate IPvlan logic behind cloud-provider-hint=ALIBABA to prevent false activation on non-Alibaba nodes with InfiniBand devices

Signed-off-by: hongqi.yu <yuhongqi.yhq@alibaba-inc.com>
1 parent 7dc9ab5 commit f529806

10 files changed

Lines changed: 1079 additions & 12 deletions

File tree

pkg/apis/types.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,13 @@ type InterfaceConfig struct {
8686
// If provided, the interface will be enslaved to a VRF device with this name.
8787
// This enables grouping multiple network interfaces into the same VRF.
8888
VRF *VRFConfig `json:"vrf,omitempty"`
89+
90+
// IPVlan, when set, instructs the driver to create an IPvlan slave interface
91+
// from the parent netdev instead of moving the host netdev into the Pod's
92+
// network namespace. The parent netdev remains in the host namespace.
93+
// This is used for HPN (High-Performance Networking) fabrics where the
94+
// parent netdev must retain its identity and address on the fabric.
95+
IPVlan *IPVlanConfig `json:"ipvlan,omitempty"`
8996
}
9097

9198
// VRFConfig represents the configuration for a Virtual Routing and Forwarding domain.
@@ -100,6 +107,17 @@ type VRFConfig struct {
100107
Table *int `json:"table,omitempty"`
101108
}
102109

110+
// IPVlanConfig asks the driver to attach the Pod through a newly created
// IPvlan slave interface, as opposed to relocating the host netdev into the
// Pod namespace.
type IPVlanConfig struct {
	// Mode selects how the IPvlan slave obtains its address.
	//
	// Supported values:
	//
	//	"ipv6" - the Pod IPv6 address is built from the parent's prefix
	//	         (first 12 bytes) followed by the Pod's IPv4 address
	//	         (last 4 bytes). The kernel IPvlan mode is L2 with the
	//	         BRIDGE flag.
	Mode string `json:"mode"`
}
120+
103121
// RouteConfig represents a network route configuration.
104122
type RouteConfig struct {
105123
// Destination is the target network in CIDR format (e.g., "0.0.0.0/0", "10.0.0.0/8").
Lines changed: 198 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,198 @@
1+
/*
2+
Copyright The Kubernetes Authors
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
https://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package alibaba
18+
19+
import (
20+
"context"
21+
"fmt"
22+
"io"
23+
"net/http"
24+
"os"
25+
"strings"
26+
"time"
27+
28+
"k8s.io/apimachinery/pkg/util/wait"
29+
"k8s.io/klog/v2"
30+
31+
resourceapi "k8s.io/api/resource/v1"
32+
"sigs.k8s.io/dranet/pkg/apis"
33+
"sigs.k8s.io/dranet/pkg/cloudprovider"
34+
)
35+
36+
const (
	// AlibabaAttrPrefix is the domain prefix shared by every Alibaba
	// Cloud-specific device attribute name.
	AlibabaAttrPrefix = "alibaba.dra.net"

	// AttrInstanceType is the qualified attribute name under which the
	// instance type is published.
	AttrInstanceType = AlibabaAttrPrefix + "/instanceType"

	// imdsEndpoint is the base URL of the Alibaba Cloud ECS Instance
	// Metadata Service.
	imdsEndpoint = "http://100.100.100.200/latest"
	// imdsTokenPath is appended to imdsEndpoint to request an IMDSv2
	// session token.
	imdsTokenPath = "/api/token"
	// imdsTokenTTL is the token lifetime, in seconds, sent with the token
	// request.
	imdsTokenTTL = "21600"
)
47+
48+
// Compile-time assertion that *AlibabaInstance implements cloudprovider.CloudInstance.
var _ cloudprovider.CloudInstance = (*AlibabaInstance)(nil)
49+
50+
// AlibabaInstance carries the Alibaba Cloud instance metadata that matters
// when configuring network devices.
type AlibabaInstance struct {
	// InstanceType is the instance type string as reported by IMDS; it is
	// empty when the lookup did not succeed.
	InstanceType string
	// IsHPN records whether the instance was classified as an HPN machine.
	IsHPN bool
}
55+
56+
// OnAlibaba returns true if running on an Alibaba Cloud ECS instance.
57+
func OnAlibaba(ctx context.Context) bool {
58+
pollCtx, cancel := context.WithTimeout(ctx, 5*time.Second)
59+
defer cancel()
60+
return wait.PollUntilContextCancel(pollCtx, 1*time.Second, true, func(ctx context.Context) (bool, error) {
61+
token, err := fetchIMDSToken(ctx)
62+
if err != nil {
63+
return false, nil
64+
}
65+
req, err := http.NewRequestWithContext(ctx, http.MethodGet, imdsEndpoint+"/meta-data/instance-id", nil)
66+
if err != nil {
67+
return false, nil
68+
}
69+
req.Header.Set("X-aliyun-ecs-metadata-token", token)
70+
resp, err := http.DefaultClient.Do(req)
71+
if err != nil {
72+
return false, nil
73+
}
74+
defer resp.Body.Close()
75+
return resp.StatusCode == http.StatusOK, nil
76+
}) == nil
77+
}
78+
79+
// GetInstance retrieves Alibaba Cloud instance metadata via IMDS.
80+
func GetInstance(ctx context.Context) (cloudprovider.CloudInstance, error) {
81+
instanceType, err := queryIMDS(ctx, "/meta-data/instance/instance-type")
82+
if err != nil {
83+
klog.Infof("could not get Alibaba instance type: %v", err)
84+
}
85+
86+
isHPN := detectHPN(instanceType)
87+
klog.Infof("Alibaba Cloud instance: type=%q hpn=%v", instanceType, isHPN)
88+
89+
return &AlibabaInstance{
90+
InstanceType: instanceType,
91+
IsHPN: isHPN,
92+
}, nil
93+
}
94+
95+
// GetDeviceAttributes returns Alibaba Cloud-specific attributes for a device.
96+
func (a *AlibabaInstance) GetDeviceAttributes(id cloudprovider.DeviceIdentifiers) map[resourceapi.QualifiedName]resourceapi.DeviceAttribute {
97+
attributes := make(map[resourceapi.QualifiedName]resourceapi.DeviceAttribute)
98+
if a.InstanceType != "" {
99+
attributes[AttrInstanceType] = resourceapi.DeviceAttribute{StringValue: &a.InstanceType}
100+
}
101+
return attributes
102+
}
103+
104+
// GetDeviceConfig returns a NetworkConfig that signals IPvlan mode for
105+
// HPN bond devices. Non-HPN or non-bond devices return nil.
106+
func (a *AlibabaInstance) GetDeviceConfig(id cloudprovider.DeviceIdentifiers) *apis.NetworkConfig {
107+
if !a.IsHPN || !isHPNBondDevice(id.Name) {
108+
return nil
109+
}
110+
return &apis.NetworkConfig{
111+
Interface: apis.InterfaceConfig{
112+
IPVlan: &apis.IPVlanConfig{
113+
Mode: "ipv6",
114+
},
115+
},
116+
}
117+
}
118+
119+
// isHPNBondDevice reports whether name looks like an HPN bond master
// interface, i.e. whether it starts with "bond" (bond0, bond1, ...). Names
// without that prefix, such as bond slaves (reth*), are rejected.
func isHPNBondDevice(name string) bool {
	const bondPrefix = "bond"
	_, isBond := strings.CutPrefix(name, bondPrefix)
	return isBond
}
125+
126+
// detectHPN determines whether this instance is a HPN machine.
127+
// Only called when we already know we're on Alibaba Cloud (via IMDS
128+
// reachability or --cloud-provider-hint=ALIBABA).
129+
func detectHPN(instanceType string) bool {
130+
lower := strings.ToLower(instanceType)
131+
if strings.Contains(lower, "hpn") || strings.Contains(lower, "efg") {
132+
return true
133+
}
134+
// On confirmed Alibaba Cloud instances where IMDS didn't return instance type
135+
// (e.g. bare-metal), check for RDMA infiniband devices as HPN indicator.
136+
if instanceType == "" {
137+
if entries, err := os.ReadDir("/sys/class/infiniband"); err == nil && len(entries) > 0 {
138+
klog.V(2).Infof("Alibaba Cloud instance with %d infiniband devices, assuming HPN", len(entries))
139+
return true
140+
}
141+
}
142+
return false
143+
}
144+
145+
// fetchIMDSToken obtains a session token for IMDSv2.
146+
func fetchIMDSToken(ctx context.Context) (string, error) {
147+
req, err := http.NewRequestWithContext(ctx, http.MethodPut, imdsEndpoint+imdsTokenPath, nil)
148+
if err != nil {
149+
return "", err
150+
}
151+
req.Header.Set("X-aliyun-ecs-metadata-token-ttl-seconds", imdsTokenTTL)
152+
resp, err := http.DefaultClient.Do(req)
153+
if err != nil {
154+
return "", err
155+
}
156+
defer resp.Body.Close()
157+
if resp.StatusCode != http.StatusOK {
158+
return "", fmt.Errorf("IMDS token request returned %d", resp.StatusCode)
159+
}
160+
body, err := io.ReadAll(resp.Body)
161+
if err != nil {
162+
return "", err
163+
}
164+
return strings.TrimSpace(string(body)), nil
165+
}
166+
167+
// queryIMDS fetches a single metadata value from Alibaba Cloud IMDS (v2 with token).
168+
func queryIMDS(ctx context.Context, path string) (string, error) {
169+
var result string
170+
err := wait.PollUntilContextTimeout(ctx, 1*time.Second, 10*time.Second, true, func(ctx context.Context) (bool, error) {
171+
token, err := fetchIMDSToken(ctx)
172+
if err != nil {
173+
klog.V(4).Infof("IMDS token fetch failed: %v", err)
174+
return false, nil
175+
}
176+
req, err := http.NewRequestWithContext(ctx, http.MethodGet, imdsEndpoint+path, nil)
177+
if err != nil {
178+
return false, nil
179+
}
180+
req.Header.Set("X-aliyun-ecs-metadata-token", token)
181+
resp, err := http.DefaultClient.Do(req)
182+
if err != nil {
183+
klog.V(4).Infof("IMDS request to %s failed: %v", path, err)
184+
return false, nil
185+
}
186+
defer resp.Body.Close()
187+
if resp.StatusCode != http.StatusOK {
188+
return false, nil
189+
}
190+
body, err := io.ReadAll(resp.Body)
191+
if err != nil {
192+
return false, nil
193+
}
194+
result = strings.TrimSpace(string(body))
195+
return true, nil
196+
})
197+
return result, err
198+
}
Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
/*
2+
Copyright The Kubernetes Authors
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
https://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package alibaba
18+
19+
import (
20+
"testing"
21+
22+
"sigs.k8s.io/dranet/pkg/cloudprovider"
23+
)
24+
25+
func TestGetDeviceAttributes(t *testing.T) {
26+
tests := []struct {
27+
name string
28+
instance AlibabaInstance
29+
wantInstType string
30+
}{
31+
{
32+
name: "HPN instance",
33+
instance: AlibabaInstance{
34+
InstanceType: "ecs.ebmhpn7.320xlarge",
35+
IsHPN: true,
36+
},
37+
wantInstType: "ecs.ebmhpn7.320xlarge",
38+
},
39+
{
40+
name: "regular ECS instance",
41+
instance: AlibabaInstance{
42+
InstanceType: "ecs.g7.xlarge",
43+
IsHPN: false,
44+
},
45+
wantInstType: "ecs.g7.xlarge",
46+
},
47+
}
48+
49+
for _, tt := range tests {
50+
t.Run(tt.name, func(t *testing.T) {
51+
attrs := tt.instance.GetDeviceAttributes(cloudprovider.DeviceIdentifiers{Name: "bond0"})
52+
if tt.wantInstType != "" {
53+
instAttr, ok := attrs[AttrInstanceType]
54+
if !ok {
55+
t.Fatal("missing instanceType attribute")
56+
}
57+
if instAttr.StringValue == nil || *instAttr.StringValue != tt.wantInstType {
58+
t.Errorf("instanceType = %v, want %s", instAttr.StringValue, tt.wantInstType)
59+
}
60+
}
61+
})
62+
}
63+
}
64+
65+
func TestGetDeviceConfig(t *testing.T) {
66+
t.Run("HPN returns IPVlan config for bond device", func(t *testing.T) {
67+
instance := &AlibabaInstance{IsHPN: true}
68+
config := instance.GetDeviceConfig(cloudprovider.DeviceIdentifiers{Name: "bond0"})
69+
if config == nil {
70+
t.Fatal("expected non-nil config for HPN bond device")
71+
}
72+
if config.Interface.IPVlan == nil {
73+
t.Fatal("expected IPVlan config")
74+
}
75+
if config.Interface.IPVlan.Mode != "ipv6" {
76+
t.Errorf("IPVlan mode = %q, want %q", config.Interface.IPVlan.Mode, "ipv6")
77+
}
78+
})
79+
80+
t.Run("HPN returns nil for non-bond device", func(t *testing.T) {
81+
instance := &AlibabaInstance{IsHPN: true}
82+
config := instance.GetDeviceConfig(cloudprovider.DeviceIdentifiers{Name: "reth0"})
83+
if config != nil {
84+
t.Errorf("expected nil config for bond slave, got %v", config)
85+
}
86+
})
87+
88+
t.Run("non-HPN returns nil", func(t *testing.T) {
89+
instance := &AlibabaInstance{IsHPN: false}
90+
config := instance.GetDeviceConfig(cloudprovider.DeviceIdentifiers{Name: "bond0"})
91+
if config != nil {
92+
t.Errorf("expected nil config for non-HPN, got %v", config)
93+
}
94+
})
95+
}
96+
97+
func TestDetectHPNHPN(t *testing.T) {
98+
tests := []struct {
99+
name string
100+
instanceType string
101+
want bool
102+
}{
103+
{"hpn in name", "ecs.ebmhpn7.320xlarge", true},
104+
{"hpn in name", "ecs.hpn2.xlarge", true},
105+
{"HPN uppercase", "ecs.ebmHPN.large", true},
106+
{"regular instance without infiniband", "ecs.g7.xlarge", false},
107+
}
108+
109+
for _, tt := range tests {
110+
t.Run(tt.name, func(t *testing.T) {
111+
got := detectHPN(tt.instanceType)
112+
// For types without "hpn", the result depends on
113+
// whether /sys/class/infiniband exists on the test host.
114+
// We only assert for positive matches from the instance type.
115+
if tt.want && !got {
116+
t.Errorf("detectHPN(%q) = false, want true", tt.instanceType)
117+
}
118+
})
119+
}
120+
}

pkg/driver/dra_hooks.go

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,47 @@ func (np *NetworkDriver) prepareResourceClaim(ctx context.Context, claim *resour
256256
continue
257257
}
258258

259+
// IPvlan path: cloud provider signals that the parent netdev must remain
260+
// in the host namespace. Instead of moving the netdev, IPvlan slaves are
261+
// created during RunPodSandbox.
262+
if netconf.Interface.IPVlan != nil {
263+
klog.V(2).Infof("IPvlan mode for device %s (mode=%s)", result.Device, netconf.Interface.IPVlan.Mode)
264+
slaves, err := computeIPVlanSlaves(nlHandle)
265+
if err != nil {
266+
errorList = append(errorList, fmt.Errorf("failed to compute IPvlan slaves for device %s: %v", result.Device, err))
267+
continue
268+
}
269+
deviceCfg.IPVlanSlaves = slaves
270+
271+
// RDMA char devices are still needed for container injection.
272+
// Use a dedicated set: shared-mode pods need ALL RDMA char devices
273+
// on the node, not just one device's.
274+
ipvlanCharDevs := sets.New[string]()
275+
rdmaLinks, _ := netlink.RdmaLinkList()
276+
for _, rl := range rdmaLinks {
277+
buildRDMAConfig(rl.Attrs.Name, ipvlanCharDevs)
278+
}
279+
if ipvlanCharDevs.Len() > 0 {
280+
deviceCfg.RDMADevice = RDMAConfig{DevChars: make([]LinuxDevice, 0, ipvlanCharDevs.Len())}
281+
for _, devpath := range ipvlanCharDevs.UnsortedList() {
282+
dev, err := GetDeviceInfo(devpath)
283+
if err != nil {
284+
klog.Warningf("failed to get device info for %s: %v", devpath, err)
285+
continue
286+
}
287+
deviceCfg.RDMADevice.DevChars = append(deviceCfg.RDMADevice.DevChars, dev)
288+
}
289+
}
290+
291+
for _, uid := range podUIDs {
292+
if err := np.podConfigStore.SetDeviceConfig(uid, result.Device, deviceCfg); err != nil {
293+
errorList = append(errorList, fmt.Errorf("failed to persist device config for pod %s device %s: %v", uid, result.Device, err))
294+
}
295+
}
296+
klog.V(4).Infof("IPvlan claim resources for pods %v : %#v", podUIDs, deviceCfg)
297+
continue
298+
}
299+
259300
ifName, err := np.netdb.GetNetInterfaceName(result.Device)
260301
if err != nil {
261302
errorList = append(errorList, fmt.Errorf("failed to get network interface name for device %s: %v", result.Device, err))

0 commit comments

Comments
 (0)