From c8439b1de172951669b399d7aa577724afa4c335 Mon Sep 17 00:00:00 2001 From: Cyclinder Kuo Date: Mon, 27 Apr 2026 22:01:30 +0800 Subject: [PATCH 1/3] Add IaaS network provider integration support * add IaaS provider configuration parameters to values.yaml and README * mount IaaS TLS secret to controller and agent pods * add IaaS provider config to configmap * implement IaaS client with mTLS support for allocate/release IP operations * add IaaS config validation in controller and agent daemon startup * integrate IaaS client into IPAM workflow Signed-off-by: Cyclinder Kuo --- charts/spiderpool/README.md | 6 + ...pool.spidernet.io_spidermultusconfigs.yaml | 1 - charts/spiderpool/templates/configmap.yaml | 2 + charts/spiderpool/values.yaml | 6 + cmd/spiderpool-agent/cmd/daemon.go | 20 + cmd/spiderpool-controller/cmd/config.go | 2 + cmd/spiderpool-controller/cmd/daemon.go | 18 + pkg/gcmanager/gc_manager.go | 10 +- pkg/gcmanager/scanAll_IPPool.go | 19 + pkg/gcmanager/tracePod_worker.go | 27 ++ pkg/iaas/client/client.go | 305 ++++++++++++++++ pkg/iaas/client/types.go | 77 ++++ pkg/iaas/utils/multus.go | 153 ++++++++ pkg/ipam/allocate.go | 8 + pkg/ipam/config.go | 4 + pkg/ipam/iaas.go | 345 ++++++++++++++++++ pkg/ipam/ipam.go | 17 +- pkg/ipam/release.go | 6 + .../v2beta1/spidermultus_types.go | 4 +- pkg/multuscniconfig/multusconfig_informer.go | 3 +- pkg/multuscniconfig/multusconfig_mutate.go | 4 - pkg/multuscniconfig/multusconfig_validate.go | 39 +- pkg/multuscniconfig/utils.go | 2 +- pkg/types/k8s.go | 5 + .../checklists/requirements.md | 115 ++++++ .../data-model.md | 232 ++++++++++++ specs/003-iaas-provider-integration/plan.md | 315 ++++++++++++++++ .../quickstart.md | 277 ++++++++++++++ specs/003-iaas-provider-integration/spec.md | 301 +++++++++++++++ .../tasks-phase2.md | 342 +++++++++++++++++ specs/003-iaas-provider-integration/tasks.md | 333 +++++++++++++++++ 31 files changed, 2960 insertions(+), 38 deletions(-) create mode 100644 pkg/iaas/client/client.go create mode 100644 pkg/iaas/client/types.go create mode 100644 pkg/iaas/utils/multus.go create mode 100644 pkg/ipam/iaas.go create mode 100644 specs/003-iaas-provider-integration/checklists/requirements.md create mode 100644 specs/003-iaas-provider-integration/data-model.md create mode 100644 specs/003-iaas-provider-integration/plan.md create mode 100644 specs/003-iaas-provider-integration/quickstart.md create mode 100644 specs/003-iaas-provider-integration/spec.md create mode 100644 specs/003-iaas-provider-integration/tasks-phase2.md create mode 100644 specs/003-iaas-provider-integration/tasks.md diff --git a/charts/spiderpool/README.md b/charts/spiderpool/README.md index 8267cd67a..ea624302a 100644 --- a/charts/spiderpool/README.md +++ b/charts/spiderpool/README.md @@ -459,3 +459,9 @@ helm install spiderpool spiderpool/spiderpool --wait --namespace kube-system \ | `sriov.injectortls.auto.certExpiration` | server cert expiration for auto method | `73000` | | `sriov.injectortls.auto.extraIpAddresses` | extra IP addresses of server certificate for auto method | `[]` | | `sriov.injectortls.auto.extraDnsNames` | extra DNS names of server cert for auto method | `[]` | + +### IaaS Network Provider Integration + +| Name | Description | Value | +| ------------------------------- | ----------------------------------------------------------------------------------------- | ----- | +| `iaasNetworkProvider.serverUrl` | the URL of the IaaS provider service (host:port). If empty, IaaS integration is disabled. | `""` | diff --git a/charts/spiderpool/crds/spiderpool.spidernet.io_spidermultusconfigs.yaml b/charts/spiderpool/crds/spiderpool.spidernet.io_spidermultusconfigs.yaml index 77fa1bcd7..0701559fc 100644 --- a/charts/spiderpool/crds/spiderpool.spidernet.io_spidermultusconfigs.yaml +++ b/charts/spiderpool/crds/spiderpool.spidernet.io_spidermultusconfigs.yaml @@ -549,7 +549,6 @@ spec: type: integer required: - master - - vlanID type: object type: object type: object diff --git a/charts/spiderpool/templates/configmap.yaml b/charts/spiderpool/templates/configmap.yaml index d208ef1ee..c9cc7946b 100644 --- a/charts/spiderpool/templates/configmap.yaml +++ b/charts/spiderpool/templates/configmap.yaml @@ -36,6 +36,8 @@ data: enabled: {{ .Values.spiderpoolController.podResourceInject.enabled }} namespacesExclude: {{ toJson .Values.spiderpoolController.podResourceInject.namespacesExclude }} namespacesInclude: {{ toJson .Values.spiderpoolController.podResourceInject.namespacesInclude }} + iaasNetworkProvider: + serverUrl: {{ .Values.iaasNetworkProvider.serverUrl | quote }} {{- if .Values.multus.multusCNI.install }} --- kind: ConfigMap diff --git a/charts/spiderpool/values.yaml b/charts/spiderpool/values.yaml index 5f1a930d6..11f613ed3 100644 --- a/charts/spiderpool/values.yaml +++ b/charts/spiderpool/values.yaml @@ -1063,3 +1063,9 @@ sriov: ## @param sriov.injectortls.auto.extraDnsNames extra DNS names of server cert for auto method extraDnsNames: [] + +## @section IaaS Network Provider Integration +## +iaasNetworkProvider: + ## @param iaasNetworkProvider.serverUrl the URL of the IaaS provider service (e.g. http://host:port or https://host:port). Must include scheme. If empty, IaaS integration is disabled. + serverUrl: "" diff --git a/cmd/spiderpool-agent/cmd/daemon.go b/cmd/spiderpool-agent/cmd/daemon.go index 552e92a33..6fd6c4bf0 100644 --- a/cmd/spiderpool-agent/cmd/daemon.go +++ b/cmd/spiderpool-agent/cmd/daemon.go @@ -25,6 +25,7 @@ import ( "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" + iaasClientPkg "github.com/spidernet-io/spiderpool/pkg/iaas/client" "github.com/spidernet-io/spiderpool/pkg/ipam" "github.com/spidernet-io/spiderpool/pkg/ippoolmanager" "github.com/spidernet-io/spiderpool/pkg/kubevirtmanager" @@ -78,6 +79,23 @@ func DaemonMain() { } logger.Sugar().Infof("Spiderpool-agent config: %+v", agentContext.Cfg) + // Validate IaaS provider configuration and create client + if err := iaasClientPkg.ValidateConfig(&agentContext.Cfg.IaaSProviderConfig); err != nil { + logger.Sugar().Warnf("IaaS provider configuration validation failed: %v", err) + } + + // Create IaaS client if configured + var iaasClient iaasClientPkg.Client + if agentContext.Cfg.IaaSProviderConfig.ServerURL != "" { + c, err := iaasClientPkg.NewClient(&agentContext.Cfg.IaaSProviderConfig, logger) + if err != nil { + logger.Sugar().Fatalf("Failed to create IaaS client: %v", err) + } else { + iaasClient = c + logger.Info("IaaS client created successfully") + } + } + // setup sysctls if agentContext.Cfg.TuneSysctlConfig { if err := sysctlConfig(agentContext.Cfg.EnableIPv4, agentContext.Cfg.EnableIPv6); err != nil { @@ -171,10 +189,12 @@ func DaemonMain() { EnableKubevirtStaticIP: agentContext.Cfg.EnableKubevirtStaticIP, EnableReleaseConflictIPsForStateless: agentContext.Cfg.EnableReleaseConflictIPsForStateless, EnableIPConflictDetection: agentContext.Cfg.EnableIPConflictDetection, + IaaSClient: iaasClient, EnableGatewayDetection: agentContext.Cfg.EnableGatewayDetection, OperationRetries: agentContext.Cfg.WaitSubnetPoolMaxRetries, OperationGapDuration: time.Duration(agentContext.Cfg.WaitSubnetPoolTime) * time.Second, AgentNamespace: agentContext.Cfg.AgentPodNamespace, + APIReader: mgr.GetClient(), } if len(agentContext.Cfg.MultusClusterNetwork) != 0 { ipamConfig.MultusClusterNetwork = ptr.To(agentContext.Cfg.MultusClusterNetwork) diff --git a/cmd/spiderpool-controller/cmd/config.go b/cmd/spiderpool-controller/cmd/config.go index 74c08651d..d62f4d07e 100644 --- a/cmd/spiderpool-controller/cmd/config.go +++ b/cmd/spiderpool-controller/cmd/config.go @@ -22,6 +22,7 @@ import ( "github.com/spidernet-io/spiderpool/api/v1/controller/server" "github.com/spidernet-io/spiderpool/pkg/election" "github.com/spidernet-io/spiderpool/pkg/gcmanager" + iaasclient "github.com/spidernet-io/spiderpool/pkg/iaas/client" "github.com/spidernet-io/spiderpool/pkg/ippoolmanager" "github.com/spidernet-io/spiderpool/pkg/kubevirtmanager" "github.com/spidernet-io/spiderpool/pkg/logutils" @@ -191,6 +192,7 @@ type ControllerContext struct { StsManager statefulsetmanager.StatefulSetManager KubevirtManager kubevirtmanager.KubevirtManager Leader election.SpiderLeaseElector + IaaSClient iaasclient.Client // handler HTTPServer *server.Server diff --git a/cmd/spiderpool-controller/cmd/daemon.go b/cmd/spiderpool-controller/cmd/daemon.go index ecd83eb1b..875e7e53c 100644 --- a/cmd/spiderpool-controller/cmd/daemon.go +++ b/cmd/spiderpool-controller/cmd/daemon.go @@ -28,6 +28,7 @@ import ( "github.com/spidernet-io/spiderpool/pkg/election" "github.com/spidernet-io/spiderpool/pkg/event" "github.com/spidernet-io/spiderpool/pkg/gcmanager" + iaasClientPkg "github.com/spidernet-io/spiderpool/pkg/iaas/client" "github.com/spidernet-io/spiderpool/pkg/ippoolmanager" crdclientset "github.com/spidernet-io/spiderpool/pkg/k8s/client/clientset/versioned" "github.com/spidernet-io/spiderpool/pkg/kubevirtmanager" @@ -81,6 +82,22 @@ func DaemonMain() { } logger.Sugar().Infof("Spiderpool-controller config: %+v", controllerContext.Cfg) + // Validate IaaS provider configuration + if err := iaasClientPkg.ValidateConfig(&controllerContext.Cfg.IaaSProviderConfig); err != nil { + logger.Sugar().Warnf("IaaS provider configuration validation failed: %v", err) + } + + // Create IaaS client if configured + if controllerContext.Cfg.IaaSProviderConfig.ServerURL != "" { + c, err := iaasClientPkg.NewClient(&controllerContext.Cfg.IaaSProviderConfig, logger) + if err != nil { + logger.Sugar().Fatalf("Failed to create IaaS client: %v", err) + } else { + controllerContext.IaaSClient = c + logger.Info("IaaS client created successfully") + } + } + // Set up gops. if controllerContext.Cfg.GopsListenPort != "" { address := "127.0.0.1:" + controllerContext.Cfg.GopsListenPort @@ -413,6 +430,7 @@ func initGCManager(ctx context.Context) { controllerContext.KubevirtManager, controllerContext.NodeManager, controllerContext.Leader, + controllerContext.IaaSClient, ) if nil != err { logger.Fatal(err.Error()) diff --git a/pkg/gcmanager/gc_manager.go b/pkg/gcmanager/gc_manager.go index 002ae1532..eb363964c 100644 --- a/pkg/gcmanager/gc_manager.go +++ b/pkg/gcmanager/gc_manager.go @@ -9,6 +9,7 @@ import ( "time" "github.com/spidernet-io/spiderpool/pkg/election" + iaasclient "github.com/spidernet-io/spiderpool/pkg/iaas/client" "github.com/spidernet-io/spiderpool/pkg/ippoolmanager" "github.com/spidernet-io/spiderpool/pkg/kubevirtmanager" "github.com/spidernet-io/spiderpool/pkg/limiter" @@ -76,6 +77,7 @@ type SpiderGC struct { kubevirtMgr kubevirtmanager.KubevirtManager nodeMgr nodemanager.NodeManager leader election.SpiderLeaseElector + iaasClient iaasclient.Client informerFactory informers.SharedInformerFactory gcLimiter limiter.Limiter @@ -90,6 +92,7 @@ func NewGCManager(clientSet *kubernetes.Clientset, config *GarbageCollectionConf kubevirtMgr kubevirtmanager.KubevirtManager, nodeMgr nodemanager.NodeManager, spiderControllerLeader election.SpiderLeaseElector, + iaasClient iaasclient.Client, ) (GCManager, error) { if clientSet == nil { return nil, fmt.Errorf("k8s ClientSet must be specified") @@ -131,9 +134,10 @@ func NewGCManager(clientSet *kubernetes.Clientset, config *GarbageCollectionConf kubevirtMgr: kubevirtMgr, nodeMgr: nodeMgr, - leader: spiderControllerLeader, - gcLimiter: limiter.NewLimiter(limiter.LimiterConfig{}), - Locker: lock.Mutex{}, + leader: spiderControllerLeader, + iaasClient: iaasClient, + gcLimiter: limiter.NewLimiter(limiter.LimiterConfig{}), + Locker: lock.Mutex{}, } return spiderGC, nil diff --git a/pkg/gcmanager/scanAll_IPPool.go b/pkg/gcmanager/scanAll_IPPool.go index 75bf75730..791f34b92 100644 --- a/pkg/gcmanager/scanAll_IPPool.go +++ b/pkg/gcmanager/scanAll_IPPool.go @@ -16,6 +16,7 @@ import ( corev1 "k8s.io/api/core/v1" "github.com/spidernet-io/spiderpool/pkg/constant" + iaasclient "github.com/spidernet-io/spiderpool/pkg/iaas/client" spiderpoolv2beta1 "github.com/spidernet-io/spiderpool/pkg/k8s/apis/spiderpool.spidernet.io/v2beta1" "github.com/spidernet-io/spiderpool/pkg/logutils" "github.com/spidernet-io/spiderpool/pkg/nodemanager" @@ -395,6 +396,24 @@ func (s *SpiderGC) executeScanAll(ctx context.Context) { } else { scanAllLogger.Sugar().Infof("scan all successfully reclaimed the IP %s in IPPool: %s", poolIP, pool.Name) } + + if s.iaasClient != nil { + nodeName := "" + if endpoint != nil { + nodeName = endpoint.Status.Current.Node + } + if releaseErr := s.iaasClient.ReleaseIPs(ctx, &iaasclient.ReleaseIPsRequest{ + PodName: podName, + PodNamespace: podNS, + PodUID: poolIPAllocation.PodUID, + NodeName: nodeName, + IPAddresses: []string{poolIP}, + }); releaseErr != nil { + scanAllLogger.Sugar().Errorf("failed to release IaaS IP '%s', error: '%v'", poolIP, releaseErr) + } else { + scanAllLogger.Sugar().Infof("scan all successfully released IaaS IP %s", poolIP) + } + } } if flagGCEndpoint { err = s.wepMgr.ReleaseEndpointAndFinalizer(logutils.IntoContext(ctx, scanAllLogger), podNS, podName, constant.UseCache) diff --git a/pkg/gcmanager/tracePod_worker.go b/pkg/gcmanager/tracePod_worker.go index 378525e86..5f86689e1 100644 --- a/pkg/gcmanager/tracePod_worker.go +++ b/pkg/gcmanager/tracePod_worker.go @@ -16,6 +16,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "github.com/spidernet-io/spiderpool/pkg/constant" + iaasclient "github.com/spidernet-io/spiderpool/pkg/iaas/client" "github.com/spidernet-io/spiderpool/pkg/metric" "github.com/spidernet-io/spiderpool/pkg/types" "github.com/spidernet-io/spiderpool/pkg/utils/convert" @@ -160,6 +161,32 @@ func (s *SpiderGC) releaseIPPoolIPExecutor(ctx context.Context, workerIndex int) return errRequeue } + // Release IPs from IaaS provider after releasing from internal IPPools + if s.iaasClient != nil { + var ipAddresses []string + for _, detail := range endpoint.Status.Current.IPs { + if detail.IPv4 != nil { + ipAddresses = append(ipAddresses, *detail.IPv4) + } + } + if len(ipAddresses) > 0 { + req := &iaasclient.ReleaseIPsRequest{ + PodName: podCache.PodName, + PodNamespace: podCache.Namespace, + PodUID: podCache.UID, + NodeName: endpoint.Status.Current.Node, + IPAddresses: ipAddresses, + } + if err := s.iaasClient.ReleaseIPs(ctx, req); err != nil { + log.Sugar().Errorf("failed to release IaaS IPs for '%s/%s', error: %v", + podCache.Namespace, podCache.PodName, err) + return err + } + log.Sugar().Infof("successfully released IaaS IPs %v for '%s/%s'", + ipAddresses, podCache.Namespace, podCache.PodName) + } + } + // delete StatefulSet/kubevirtVMI wep (other controller wep has OwnerReference, its lifecycle is same with pod) if (endpoint.Status.OwnerControllerType == constant.KindStatefulSet || endpoint.Status.OwnerControllerType == constant.KindKubevirtVMI) && endpoint.DeletionTimestamp == nil { diff --git a/pkg/iaas/client/client.go b/pkg/iaas/client/client.go new file mode 100644 index 000000000..00dacd04f --- /dev/null +++ b/pkg/iaas/client/client.go @@ -0,0 +1,305 @@ +// Copyright 2025 Authors of spidernet-io +// SPDX-License-Identifier: Apache-2.0 + +package client + +import ( + "bytes" + "context" + "crypto/tls" + "encoding/json" + "fmt" + "io" + "net" + "net/http" + "net/url" + "sync" + "time" + + "go.uber.org/zap" + + spiderpooltypes "github.com/spidernet-io/spiderpool/pkg/types" +) + +const ( + allocateAPIPath = "/v1/apis/network.iaas.io/ipam/allocate-ips" + releaseAPIPath = "/v1/apis/network.iaas.io/ipam/release-ip" +) + +// ParentNicMacLookupFunc is a fallback function to look up parentNicMac +// when the cache does not have the value. It receives the context and the IP CIDR string. +type ParentNicMacLookupFunc func(ctx context.Context, ipCIDR string) (string, error) + +// Client is the interface for IaaS provider API client +type Client interface { + // AllocateIPs calls the IaaS provider to allocate IPs + AllocateIPs(ctx context.Context, req *AllocateIPRequest) (*AllocateIPResponse, error) + // ReleaseIPs calls the IaaS provider to release IPs + ReleaseIPs(ctx context.Context, req *ReleaseIPsRequest) error + // GetCachedParentNicMac returns the cached parent NIC MAC for the given key, + // or empty string if not cached. Key can be SpiderMultusConfig namespace/name + // or IP CIDR string. + GetCachedParentNicMac(key string) (string, bool) + // CacheParentNicMac stores a parent NIC MAC for the given key. + CacheParentNicMac(key string, mac string) + // SetParentNicMacLookupFunc sets a fallback lookup function for parentNicMac + // when cache misses (e.g., after agent restart). + SetParentNicMacLookupFunc(fn ParentNicMacLookupFunc) +} + +// IaaSClient implements the Client interface +type IaaSClient struct { + baseURL string + httpClient *http.Client + logger *zap.Logger + + // parentNicMacCache caches key -> parent NIC MAC address. + // Keys include both SpiderMultusConfig namespace/name and IP CIDR strings, + // so that release path can look up parentNicMac by IP. + parentNicMacCache sync.Map + + // parentNicMacLookupFunc is a fallback function to look up parentNicMac + // when the cache does not have the value (e.g., after agent restart). + parentNicMacLookupFunc ParentNicMacLookupFunc +} + +// ValidateConfig validates the IaaS provider configuration. +// Returns nil if the configuration is valid or IaaS integration is disabled (URL is empty). +func ValidateConfig(cfg *spiderpooltypes.IaaSProviderConfig) error { + if cfg.ServerURL == "" { + return nil + } + u, err := url.Parse(cfg.ServerURL) + if err != nil { + return fmt.Errorf("invalid iaasNetworkProvider.serverUrl %q: %w", cfg.ServerURL, err) + } + if u.Scheme != "http" && u.Scheme != "https" { + return fmt.Errorf("invalid iaasNetworkProvider.serverUrl %q: must start with http:// or https://", cfg.ServerURL) + } + if u.Host == "" { + return fmt.Errorf("invalid iaasNetworkProvider.serverUrl %q: host is empty", cfg.ServerURL) + } + return nil +} + +// NewClient creates a new IaaS client with mTLS configuration +func NewClient(cfg *spiderpooltypes.IaaSProviderConfig, logger *zap.Logger) (*IaaSClient, error) { + if cfg.ServerURL == "" { + return nil, fmt.Errorf("IaaS provider URL is required") + } + if err := ValidateConfig(cfg); err != nil { + return nil, err + } + + // TODO: enable mTLS certificate authentication + tlsConfig := &tls.Config{ + InsecureSkipVerify: true, //nolint:gosec + } + + httpClient := &http.Client{ + Transport: &http.Transport{ + TLSClientConfig: tlsConfig, + }, + Timeout: 30 * time.Second, + } + + return &IaaSClient{ + baseURL: cfg.ServerURL, + httpClient: httpClient, + logger: logger, + }, nil +} + +// AllocateIPs calls the IaaS provider to allocate IPs +func (c *IaaSClient) AllocateIPs(ctx context.Context, req *AllocateIPRequest) (*AllocateIPResponse, error) { + c.logger.Debug("Calling IaaS allocate API", + zap.String("url", c.baseURL), + zap.String("nodeName", req.NodeName), + zap.String("podName", req.PodName), + zap.String("podNamespace", req.PodNamespace), + ) + + // Marshal request body + reqBody, err := json.Marshal(req) + if err != nil { + return nil, fmt.Errorf("failed to marshal allocate request: %w", err) + } + + // Create HTTP request + reqURL, err := url.JoinPath(c.baseURL, allocateAPIPath) + if err != nil { + return nil, fmt.Errorf("failed to construct allocate URL: %w", err) + } + httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, reqURL, bytes.NewBuffer(reqBody)) + if err != nil { + return nil, fmt.Errorf("failed to create allocate request: %w", err) + } + + httpReq.Header.Set("Content-Type", "application/json") + + // Execute request + resp, err := c.httpClient.Do(httpReq) + if err != nil { + c.logger.Error("IaaS allocate API call failed", + zap.Error(err), + zap.String("url", reqURL), + ) + return nil, fmt.Errorf("iaas allocate API call failed: %w", err) + } + defer func() { _ = resp.Body.Close() }() + + // Read response body + respBody, err := io.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("failed to read allocate response: %w", err) + } + + // Check status code - accept any 2xx success code + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + c.logger.Error("IaaS allocate API returned non-success status", + zap.Int("statusCode", resp.StatusCode), + zap.String("response", string(respBody)), + ) + return nil, fmt.Errorf("iaas allocate API returned status %d: %s", resp.StatusCode, string(respBody)) + } + + // Unmarshal response + var allocateResp AllocateIPResponse + if err := json.Unmarshal(respBody, &allocateResp); err != nil { + return nil, fmt.Errorf("failed to unmarshal allocate response: %w", err) + } + + c.logger.Info("IaaS allocate API succeeded", + zap.String("nodeName", allocateResp.NodeName), + zap.Int("allocationCount", len(allocateResp.IaaSIPsAllocationResponse)), + ) + + return &allocateResp, nil +} + +// ReleaseIPs calls the IaaS provider to release IPs. +// The provider only supports releasing one IP per request, so this method +// loops over each IP and calls the API individually. +func (c *IaaSClient) ReleaseIPs(ctx context.Context, req *ReleaseIPsRequest) error { + c.logger.Debug("Calling IaaS release API", + zap.String("url", c.baseURL), + zap.String("nodeName", req.NodeName), + zap.String("podName", req.PodName), + zap.String("podNamespace", req.PodNamespace), + zap.Strings("ipAddresses", req.IPAddresses), + ) + + reqURL, err := url.JoinPath(c.baseURL, releaseAPIPath) + if err != nil { + return fmt.Errorf("failed to construct release URL: %w", err) + } + + for _, ip := range req.IPAddresses { + c.logger.Debug("Releasing single IP via IaaS", zap.String("ip", ip)) + + ipstr, ipnet, err := net.ParseCIDR(ip) + if err != nil { + c.logger.Error("Failed to parse IP for release", zap.String("ip", ip), zap.Error(err)) + return fmt.Errorf("failed to parse IP %s: %w", ip, err) + } + + // Look up parentNicMac via lookup function (queries SMC-keyed cache or resolves from SpiderMultusConfig) + var parentNicMac string + if c.parentNicMacLookupFunc != nil { + mac, lookupErr := c.parentNicMacLookupFunc(ctx, ip) + if lookupErr != nil { + c.logger.Warn("Failed to lookup parentNicMac, proceeding with empty value", + zap.String("ip", ip), zap.Error(lookupErr)) + } else { + parentNicMac = mac + } + } else { + c.logger.Warn("No parentNicMac lookup function configured, proceeding with empty value", + zap.String("ip", ip)) + } + + singleReq := &ReleaseIPRequest{ + NodeName: req.NodeName, + IPAddress: ipstr.String(), + Subnet: ipnet.String(), + ParentNicMac: parentNicMac, + } + + if err := c.releaseSingleIP(ctx, reqURL, singleReq); err != nil { + return fmt.Errorf("failed to release IP %s: %w", ip, err) + } + } + + c.logger.Info("IaaS release API succeeded", + zap.String("nodeName", req.NodeName), + zap.Strings("ipAddresses", req.IPAddresses), + ) + + return nil +} + +// releaseSingleIP performs a single IP release API call +func (c *IaaSClient) releaseSingleIP(ctx context.Context, reqURL string, req *ReleaseIPRequest) error { + reqBody, err := json.Marshal(req) + if err != nil { + return fmt.Errorf("failed to marshal release request: %w", err) + } + + httpReq, err := http.NewRequestWithContext(ctx, http.MethodPost, reqURL, bytes.NewBuffer(reqBody)) + if err != nil { + return fmt.Errorf("failed to create release request: %w", err) + } + + httpReq.Header.Set("Content-Type", "application/json") + + resp, err := c.httpClient.Do(httpReq) + if err != nil { + c.logger.Error("IaaS release API call failed", + zap.Error(err), + zap.String("url", reqURL), + zap.String("ipAddresses", req.IPAddress), + ) + return fmt.Errorf("iaas release API call failed: %w", err) + } + defer func() { _ = resp.Body.Close() }() + + respBody, err := io.ReadAll(resp.Body) + if err != nil { + return fmt.Errorf("failed to read release response body: %w", err) + } + + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + c.logger.Error("IaaS release API returned non-success status", + zap.Int("statusCode", resp.StatusCode), + zap.String("response", string(respBody)), + zap.String("ipAddresses", req.IPAddress), + ) + return fmt.Errorf("iaas release API returned status %d: %s", resp.StatusCode, string(respBody)) + } + + return nil +} + +// GetCachedParentNicMac returns the cached parent NIC MAC for the given key, or empty string if not cached. +func (c *IaaSClient) GetCachedParentNicMac(key string) (string, bool) { + if v, ok := c.parentNicMacCache.Load(key); ok { + return v.(string), true + } + return "", false +} + +// CacheParentNicMac stores a parent NIC MAC for the given key. +func (c *IaaSClient) CacheParentNicMac(key string, mac string) { + c.parentNicMacCache.Store(key, mac) +} + +// SetParentNicMacLookupFunc sets a fallback lookup function for parentNicMac +// when cache misses (e.g., after agent restart). +func (c *IaaSClient) SetParentNicMacLookupFunc(fn ParentNicMacLookupFunc) { + c.parentNicMacLookupFunc = fn +} + +// Close closes the IaaS client +func (c *IaaSClient) Close() error { + return nil +} diff --git a/pkg/iaas/client/types.go b/pkg/iaas/client/types.go new file mode 100644 index 000000000..8ef84b171 --- /dev/null +++ b/pkg/iaas/client/types.go @@ -0,0 +1,77 @@ +// Copyright 2025 Authors of spidernet-io +// SPDX-License-Identifier: Apache-2.0 + +package client + +// AllocateIPRequest represents the request body for IaaS IP allocation API +type AllocateIPRequest struct { + // PodName is optional + PodName string `json:"podName,omitempty"` + // PodNamespace is optional + PodNamespace string `json:"podNamespace,omitempty"` + // PodUID is optional + PodUID string `json:"podUID,omitempty"` + // NodeName is required + NodeName string `json:"nodeName"` + // IaaSIPsAllocationRequest is required, at least 1 item + IaaSIPsAllocationRequest []IaaSIPAllocationItem `json:"iaasIPsAllocationRequest"` +} + +// IaaSIPAllocationItem represents a single IP allocation request item +type IaaSIPAllocationItem struct { + // IPAddress is required + IPAddress string `json:"ipAddress"` + // Subnet is required + Subnet string `json:"subnet"` + // ParentNicMac is required + ParentNicMac string `json:"parentNicMac"` +} + +// AllocateIPResponse represents the response from IaaS IP allocation API +type AllocateIPResponse struct { + // PodName from the response + PodName string `json:"podName"` + // PodNamespace from the response + PodNamespace string `json:"podNamespace"` + // NodeName from the response + NodeName string `json:"nodeName"` + // IaaSIPsAllocationResponse contains the allocation results + IaaSIPsAllocationResponse []IaaSIPAllocationResult `json:"iaasIPsAllocationResponse"` +} + +// IaaSIPAllocationResult represents a single IP allocation result +type IaaSIPAllocationResult struct { + // ParentNicMac is the parent NIC MAC address + ParentNicMac string `json:"parentNicMac"` + // Subnet is the subnet CIDR + Subnet string `json:"subnet"` + // IPAddress is the allocated IP address + IPAddress string `json:"ipAddress"` + // MacAddress is the MAC address for the allocated IP + MacAddress string `json:"macAddress"` + // VlanID is the VLAN ID + VlanID int64 `json:"vlanId"` +} + +// ReleaseIPRequest represents the request body for IaaS IP release API +type ReleaseIPsRequest struct { + // PodName is optional + PodName string `json:"podName,omitempty"` + // PodNamespace is optional + PodNamespace string `json:"podNamespace,omitempty"` + // PodUID is optional + PodUID string `json:"podUID,omitempty"` + // NodeName is required + NodeName string `json:"nodeName"` + // IPAddresses are the IPs being released + IPAddresses []string `json:"ipAddresses"` +} + +type ReleaseIPRequest struct { + // NodeName is required + NodeName string `json:"nodeName"` + // IPAddress is the IP being released + IPAddress string `json:"ipAddress"` + Subnet string `json:"subnet"` + ParentNicMac string `json:"parentNicMac"` +} diff --git a/pkg/iaas/utils/multus.go b/pkg/iaas/utils/multus.go new file mode 100644 index 000000000..f11a01556 --- /dev/null +++ b/pkg/iaas/utils/multus.go @@ -0,0 +1,153 @@ +// Copyright 2025 Authors of spidernet-io +// SPDX-License-Identifier: Apache-2.0 + +package utils + +import ( + "context" + "fmt" + "net" + "strconv" + + "github.com/spidernet-io/spiderpool/pkg/constant" + "github.com/spidernet-io/spiderpool/pkg/multuscniconfig" + "github.com/vishvananda/netlink" + corev1 "k8s.io/api/core/v1" +) + +// NetworkSelectionElement represents a network selection element +type NetworkSelectionElement struct { + Name string `json:"name"` + Namespace string `json:"namespace,omitempty"` + Interface string `json:"interface,omitempty"` +} + +// GetParentNicMac gets the parent NIC MAC address for a given interface +// This is a simplified implementation for Phase 2 +// Full implementation should: +// 1. Parse Pod annotation to get SpiderMultusConfig +// 2. Check if it's vlan CNI type +// 3. Get master interface name from config +// 4. Use netlink to get MAC address +func GetParentNicMac(ctx context.Context, pod *corev1.Pod, ifName string) (string, error) { + // For now, get the MAC from the host network interface + // In a full implementation, this would: + // 1. Parse "k8s.v1.cni.cncf.io/networks" annotation + // 2. Find the matching SpiderMultusConfig + // 3. Get master interface from config + // 4. Return master MAC + + // Get the link by name + link, err := netlink.LinkByName(ifName) + if err != nil { + return "", fmt.Errorf("failed to get link %s: %w", ifName, err) + } + + return link.Attrs().HardwareAddr.String(), nil +} + +// MultusNetworkInfo holds the multus network configuration for a specific NIC +type MultusNetworkInfo struct { + // Namespace of the multus network attachment definition + Namespace string + // Name of the multus network attachment definition + Name string +} + +// GetMultusNetworkForNIC retrieves the multus network configuration for a given NIC. +// It parses the pod's Multus annotations (both default network and additional networks) +// and returns the matching network attachment definition info. +// +// Parameters: +// - pod: the pod containing Multus annotations +// - nic: the NIC name (e.g., "eth0", "net1", or "1" for index-based lookup) +// - agentNamespace: the namespace where multus resources are defined (for default network) +// - clusterNetwork: optional cluster default network configuration +// +// Returns the MultusNetworkInfo containing namespace and name of the network attachment definition. +// This function is based on the logic from pkg/ipam/allocate.go and can be used by both +// IPAM and IaaS modules. +func GetMultusNetworkForNIC(pod *corev1.Pod, nic, agentNamespace string, clusterNetwork *string) (*MultusNetworkInfo, error) { + podAnno := pod.GetAnnotations() + + // Check for default NIC (eth0 or index 0) + if nic == constant.ClusterDefaultInterfaceName || nic == strconv.Itoa(0) { + return getDefaultMultusNetwork(podAnno, agentNamespace, clusterNetwork) + } + + // For additional NICs (net1, net2, etc.), parse the networks annotation + return getAdditionalMultusNetwork(podAnno, pod.Namespace, nic) +} + +// getDefaultMultusNetwork retrieves the default multus network configuration. +func getDefaultMultusNetwork(podAnno map[string]string, agentNamespace string, clusterNetwork *string) (*MultusNetworkInfo, error) { + // Check for default network annotation + defaultMultusObj := podAnno[constant.MultusDefaultNetAnnot] + if len(defaultMultusObj) == 0 { + if clusterNetwork == nil { + return nil, fmt.Errorf("no default multus network configured") + } + defaultMultusObj = *clusterNetwork + } + + // Parse the annotation + networks, err := multuscniconfig.ParsePodNetworkAnnotation(defaultMultusObj, agentNamespace) + if err != nil { + return nil, fmt.Errorf("failed to parse default network annotation: %w", err) + } + if len(networks) == 0 { + return nil, fmt.Errorf("empty default network annotation") + } + + // Use the first network as default + ns := networks[0].Namespace + if ns == "" { + ns = agentNamespace + } + + return &MultusNetworkInfo{ + Namespace: ns, + Name: networks[0].Name, + }, nil +} + +// getAdditionalMultusNetwork retrieves the multus network for an additional NIC. +func getAdditionalMultusNetwork(podAnno map[string]string, podNamespace, nic string) (*MultusNetworkInfo, error) { + annotation := podAnno[constant.MultusNetworkAttachmentAnnot] + if annotation == "" { + return nil, fmt.Errorf("no multus network attachment annotation found") + } + + networks, err := multuscniconfig.ParsePodNetworkAnnotation(annotation, podNamespace) + if err != nil { + return nil, fmt.Errorf("failed to parse network attachment annotation: %w", err) + } + + // Find matching network by NIC name or index + for idx, network := range networks { + // Default interface name if not specified + ifName := network.InterfaceRequest + if ifName == "" { + ifName = fmt.Sprintf("net%d", idx+1) + } + + // Match by interface name or index + if nic == ifName || nic == strconv.Itoa(idx+1) { + ns := network.Namespace + if ns == "" { + ns = podNamespace + } + return &MultusNetworkInfo{ + Namespace: ns, + Name: network.Name, + }, nil + } + } + + return nil, fmt.Errorf("no matching multus network found for NIC %s", nic) +} + +// ParseMacAddress parses a MAC address string +func ParseMacAddress(mac string) (net.HardwareAddr, error) { + return net.ParseMAC(mac) +} diff --git a/pkg/ipam/allocate.go b/pkg/ipam/allocate.go index 58c5b91e6..fcd49a7f3 100644 --- a/pkg/ipam/allocate.go +++ b/pkg/ipam/allocate.go @@ -436,6 +436,14 @@ func (i *ipam) allocateInStandardMode(ctx context.Context, addArgs *models.IpamA return nil, err } + if i.config.IaaSClient != nil { + logger.Debug("Calling IaaS provider to allocate IPs", zap.String("nic", *addArgs.IfName)) + if _, iaasErr := i.callIaaSAllocate(ctx, pod, results); iaasErr != nil { + logger.Error("IaaS allocate failed, continuing without IaaS allocation", zap.Error(iaasErr)) + return nil, fmt.Errorf("IaaS IP allocate failed: %w", iaasErr) + } + } + logger.Debug("Group custom routes by IP allocation results") if err = groupCustomRoutes(ctx, customRoutes, results); err != nil { return nil, fmt.Errorf("failed to group custom routes %+v: %w", customRoutes, err) diff --git a/pkg/ipam/config.go b/pkg/ipam/config.go index e89e6f64f..5ecb1d5bd 100644 --- a/pkg/ipam/config.go +++ b/pkg/ipam/config.go @@ -9,8 +9,10 @@ import ( "time" utilerrors "k8s.io/apimachinery/pkg/util/errors" + sigsclient "sigs.k8s.io/controller-runtime/pkg/client" "github.com/spidernet-io/spiderpool/pkg/constant" + "github.com/spidernet-io/spiderpool/pkg/iaas/client" "github.com/spidernet-io/spiderpool/pkg/logutils" ) @@ -32,6 +34,8 @@ type IPAMConfig struct { MultusClusterNetwork *string AgentNamespace string + IaaSClient client.Client + APIReader sigsclient.Reader } func setDefaultsForIPAMConfig(config IPAMConfig) IPAMConfig { diff --git a/pkg/ipam/iaas.go b/pkg/ipam/iaas.go new file mode 100644 index 000000000..16fdffcaf --- /dev/null +++ b/pkg/ipam/iaas.go @@ -0,0 +1,345 @@ +// Copyright 2025 Authors of spidernet-io +// SPDX-License-Identifier: Apache-2.0 + +package ipam + +import ( + "context" + "fmt" + "net" + + "github.com/vishvananda/netlink" + "go.uber.org/zap" + corev1 "k8s.io/api/core/v1" + ctrlclient "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/spidernet-io/spiderpool/pkg/constant" + iaasclient "github.com/spidernet-io/spiderpool/pkg/iaas/client" + iaasutils "github.com/spidernet-io/spiderpool/pkg/iaas/utils" + v2beta1 "github.com/spidernet-io/spiderpool/pkg/k8s/apis/spiderpool.spidernet.io/v2beta1" + "github.com/spidernet-io/spiderpool/pkg/logutils" + spiderpooltypes "github.com/spidernet-io/spiderpool/pkg/types" +) + +// callIaaSAllocate calls the IaaS provider API to allocate IPs +func (i *ipam) callIaaSAllocate(ctx context.Context, pod *corev1.Pod, results []*spiderpooltypes.AllocationResult) (*iaasclient.AllocateIPResponse, error) { + if i.config.IaaSClient == nil { + return nil, nil + } + + logger := logutils.FromContext(ctx).With( + zap.String("pod", pod.Name), + zap.String("namespace", pod.Namespace), + ) + + // Build IaaS allocation request + req := &iaasclient.AllocateIPRequest{ + PodName: pod.Name, + PodNamespace: pod.Namespace, + PodUID: string(pod.UID), + NodeName: pod.Spec.NodeName, + } + + // Build IP-to-result index while constructing the request, so we can later + // merge the IaaS response back into results in O(1) per item. + // result.IP.Address is a CIDR string like "10.0.0.1/24" + ipToResult := make(map[string]*spiderpooltypes.AllocationResult, len(results)) + for _, result := range results { + if result == nil || result.IP == nil || result.IP.Address == nil || result.IP.Nic == nil { + logger.Error("Skipping nil or incomplete allocation result") + return nil, fmt.Errorf("nil or incomplete allocation result") + } + ip, ipNet, err := net.ParseCIDR(*result.IP.Address) + if err != nil { + logger.Error("Failed to parse IP address", zap.String("address", *result.IP.Address), zap.Error(err)) + return nil, fmt.Errorf("failed to parse IP address: %w", err) + } + parentMac, err := i.getParentNicMacFromMultus(ctx, pod, *result.IP.Nic) + if err != nil { + logger.Error("Failed to get parent NIC MAC", zap.String("nic", *result.IP.Nic), zap.Error(err)) + return nil, fmt.Errorf("failed to get parent NIC MAC: %w", err) + } + ipStr := ip.String() + ipToResult[ipStr] = result + + req.IaaSIPsAllocationRequest = append(req.IaaSIPsAllocationRequest, iaasclient.IaaSIPAllocationItem{ + IPAddress: ipStr, + Subnet: ipNet.String(), + ParentNicMac: parentMac, + }) + } + + logger.Debug("Calling IaaS allocate API", + zap.String("podUID", string(pod.UID)), + zap.String("nodeName", pod.Spec.NodeName), + zap.Any("request", req.IaaSIPsAllocationRequest), + ) + + // Call IaaS API + resp, err := i.config.IaaSClient.AllocateIPs(ctx, req) + if err != nil { + logger.Error("IaaS allocate API failed", + zap.String("podUID", string(pod.UID)), + zap.Error(err), + ) + return nil, fmt.Errorf("iaas allocate failed: %w", err) + } + + logger.Debug("IaaS allocate API succeeded", + zap.Any("response", resp.IaaSIPsAllocationResponse), + ) + + // Merge IaaS response data (MAC, VLAN) into results via the pre-built index + for _, iaasResult := range resp.IaaSIPsAllocationResponse { + result, ok := ipToResult[iaasResult.IPAddress] + if !ok { + logger.Error("IaaS response contains unknown IP", zap.String("ip", iaasResult.IPAddress)) + return nil, fmt.Errorf("iaas response contains unknown IP %s", iaasResult.IPAddress) + } + if iaasResult.MacAddress != "" { + result.IP.Mac = iaasResult.MacAddress + } + if iaasResult.VlanID != 0 { + result.IP.Vlan = iaasResult.VlanID + } + } + + return resp, nil +} + +// callIaaSRelease calls the IaaS provider API to release IPs +func (i *ipam) callIaaSRelease(ctx context.Context, endpoint *v2beta1.SpiderEndpoint) error { + if i.config.IaaSClient == nil { + return nil + } + + logger := logutils.FromContext(ctx).With( + zap.String("pod", endpoint.Name), + zap.String("namespace", endpoint.Namespace), + ) + + // Collect all IP addresses from the endpoint allocation details + var ipAddresses []string + for _, detail := range endpoint.Status.Current.IPs { + // only ipv4 now + if detail.IPv4 != nil { + ipAddresses = append(ipAddresses, *detail.IPv4) + } + } + + if len(ipAddresses) == 0 { + logger.Debug("No IP addresses to release via IaaS") + return nil + } + + req := &iaasclient.ReleaseIPsRequest{ + PodName: endpoint.Name, + PodNamespace: endpoint.Namespace, + PodUID: endpoint.Status.Current.UID, + NodeName: endpoint.Status.Current.Node, + IPAddresses: ipAddresses, + } + + logger.Debug("Calling IaaS release API", + zap.String("podUID", endpoint.Status.Current.UID), + zap.String("nodeName", endpoint.Status.Current.Node), + zap.Strings("ipAddresses", ipAddresses), + ) + + if err := i.config.IaaSClient.ReleaseIPs(ctx, req); err != nil { + logger.Error("IaaS release API failed", + zap.String("podUID", endpoint.Status.Current.UID), + zap.Strings("ipAddresses", ipAddresses), + zap.Error(err), + ) + return fmt.Errorf("iaas release failed: %w", err) + } + + logger.Info("IaaS release API succeeded", zap.Strings("ipAddresses", ipAddresses)) + return nil +} + +// getParentNicMacFromMultus gets the parent NIC MAC address by: +// 1. Checking the in-memory cache first (keyed by SpiderMultusConfig namespace/name) +// 2. If not cached: parsing pod's Multus annotation to find the NAD for the given NIC +// 3. Reading SpiderMultusConfig (same name as NAD) to get the master interface +// 4. Using netlink to get the master interface MAC on the host +// 5. Storing the result in cache for future lookups +func (i *ipam) getParentNicMacFromMultus(ctx context.Context, pod *corev1.Pod, nic string) (string, error) { + if i.config.APIReader == nil { + return "", fmt.Errorf("APIReader is not configured") + } + + // Step 1: find the NAD info for this NIC from Multus annotations + netInfo, err := iaasutils.GetMultusNetworkForNIC(pod, nic, i.config.AgentNamespace, i.config.MultusClusterNetwork) + if err != nil { + return "", fmt.Errorf("failed to get multus network for NIC %s: %w", nic, err) + } + + // Step 2: check IaaS client cache using SpiderMultusConfig namespace/name as key + cacheKey := netInfo.Namespace + "/" + netInfo.Name + if cached, ok := i.config.IaaSClient.GetCachedParentNicMac(cacheKey); ok { + return cached, nil + } + + // Step 3: read SpiderMultusConfig (same name/namespace as the NAD) + smc := &v2beta1.SpiderMultusConfig{} + if err := i.config.APIReader.Get(ctx, ctrlclient.ObjectKey{Namespace: netInfo.Namespace, Name: netInfo.Name}, smc); err != nil { + return "", fmt.Errorf("failed to get SpiderMultusConfig %s/%s: %w", netInfo.Namespace, netInfo.Name, err) + } + + // Step 4: extract master interface name from CNI config + masterIface, err := getMasterIfaceFromMultusConfig(smc) + if err != nil { + return "", fmt.Errorf("failed to get master interface from SpiderMultusConfig %s/%s: %w", netInfo.Namespace, netInfo.Name, err) + } + + // Step 5: get MAC address of the master interface via netlink (host netns) + link, err := netlink.LinkByName(masterIface) + if err != nil { + return "", fmt.Errorf("failed to get link %s: %w", masterIface, err) + } + + mac := link.Attrs().HardwareAddr.String() + + // Step 6: store in IaaS client cache for future lookups + i.config.IaaSClient.CacheParentNicMac(cacheKey, mac) + + return mac, nil +} + +// prewarmParentNicMacCache lists all vlan-type SpiderMultusConfigs at startup +// and resolves their master interface MAC addresses into the cache. +// This ensures the cache is populated before any allocate/release calls. +func (i *ipam) prewarmParentNicMacCache(ctx context.Context) { + logger := logutils.FromContext(ctx) + logger.Info("Prewarming parentNicMac cache from SpiderMultusConfigs") + + if i.config.APIReader == nil { + logger.Warn("APIReader is not configured, skip prewarming parentNicMac cache") + return + } + + smcList := &v2beta1.SpiderMultusConfigList{} + if err := i.config.APIReader.List(ctx, smcList); err != nil { + logger.Error("Failed to list SpiderMultusConfigs for cache prewarming", zap.Error(err)) + return + } + + count := 0 + for idx := range smcList.Items { + smc := &smcList.Items[idx] + if smc.Spec.CniType == nil || *smc.Spec.CniType != constant.VlanCNI { + continue + } + + masterIface, err := getMasterIfaceFromMultusConfig(smc) + if err != nil { + continue + } + + cacheKey := smc.Namespace + "/" + smc.Name + // Skip if already cached + if _, ok := i.config.IaaSClient.GetCachedParentNicMac(cacheKey); ok { + continue + } + + link, err := netlink.LinkByName(masterIface) + if err != nil { + logger.Warn("Failed to get link for master interface during prewarm", + zap.String("masterIface", masterIface), + zap.String("smc", cacheKey), + zap.Error(err)) + continue + } + + mac := link.Attrs().HardwareAddr.String() + i.config.IaaSClient.CacheParentNicMac(cacheKey, mac) + count++ + logger.Debug("Prewarmed parentNicMac cache", + zap.String("smc", cacheKey), + zap.String("masterIface", masterIface), + zap.String("mac", mac)) + } + + logger.Info("Finished prewarming parentNicMac cache", zap.Int("count", count)) +} + +// parentNicMacFallbackLookup is a fallback function for the IaaS client to look up +// parentNicMac when the cache does not have the value (e.g., after agent restart). +// It lists all SpiderMultusConfigs with vlan CNI type, gets their master interface +// names, and resolves the MAC address via netlink. +func (i *ipam) parentNicMacFallbackLookup(ctx context.Context, _ string) (string, error) { + logger := logutils.FromContext(ctx) + logger.Info("parentNicMac fallback lookup") + + if i.config.APIReader == nil { + return "", fmt.Errorf("APIReader is not configured") + } + + // List all SpiderMultusConfigs + smcList := &v2beta1.SpiderMultusConfigList{} + if err := i.config.APIReader.List(ctx, smcList); err != nil { + return "", fmt.Errorf("failed to list SpiderMultusConfigs: %w", err) + } + + // Find vlan type SMCs and resolve their master interface MAC + for idx := range smcList.Items { + smc := &smcList.Items[idx] + if smc.Spec.CniType == nil || *smc.Spec.CniType != constant.VlanCNI { + continue + } + + masterIface, err := getMasterIfaceFromMultusConfig(smc) + if err != nil { + // Skip non-vlan or misconfigured SMCs + continue + } + + // Check cache first using SMC namespace/name as key + cacheKey := smc.Namespace + "/" + smc.Name + if cached, ok := i.config.IaaSClient.GetCachedParentNicMac(cacheKey); ok { + return cached, nil + } + + // Get MAC address of the master interface via netlink + link, err := netlink.LinkByName(masterIface) + if err != nil { + logger.Warn("failed to get link for master interface", + zap.String("masterIface", masterIface), + zap.String("smc", cacheKey), + zap.Error(err)) + continue + } + + mac := link.Attrs().HardwareAddr.String() + + // Cache using SMC namespace/name as key + i.config.IaaSClient.CacheParentNicMac(cacheKey, mac) + return mac, nil + } + + return "", fmt.Errorf("no vlan-type SpiderMultusConfig found for parentNicMac lookup") +} + +// getMasterIfaceFromMultusConfig extracts the first master interface name from a SpiderMultusConfig +func getMasterIfaceFromMultusConfig(smc *v2beta1.SpiderMultusConfig) (string, error) { + if smc.Spec.CniType == nil { + return "", fmt.Errorf("CniType is nil") + } + switch *smc.Spec.CniType { + case "vlan": + if smc.Spec.VlanConfig != nil { + if len(smc.Spec.VlanConfig.Master) == 1 { + return smc.Spec.VlanConfig.Master[0], nil + } + if len(smc.Spec.VlanConfig.Master) == 2 && smc.Spec.VlanConfig.Bond != nil { + return smc.Spec.VlanConfig.Bond.Name, nil + } + } + default: + return "", fmt.Errorf("unsupported CniType %s, only support 'vlan'", *smc.Spec.CniType) + } + + return "", fmt.Errorf("no master interface found for CniType %s", *smc.Spec.CniType) +} diff --git a/pkg/ipam/ipam.go b/pkg/ipam/ipam.go index e2f49eb18..3fbe747a6 100644 --- a/pkg/ipam/ipam.go +++ b/pkg/ipam/ipam.go @@ -80,7 +80,7 @@ func NewIPAM( return nil, fmt.Errorf("kubevirt manager %w", constant.ErrMissingRequiredParam) } - return &ipam{ + i := &ipam{ config: setDefaultsForIPAMConfig(config), ipamLimiter: limiter.NewLimiter(limiter.LimiterConfig{}), failure: newFailureCache(), @@ -92,10 +92,23 @@ func NewIPAM( stsManager: stsManager, subnetManager: subnetManager, kubevirtManager: kubevirtManager, - }, nil + } + + // Register parentNicMac fallback lookup on IaaS client so that + // release can recover from cache miss (e.g., after agent restart). + if config.IaaSClient != nil { + config.IaaSClient.SetParentNicMacLookupFunc(i.parentNicMacFallbackLookup) + } + + return i, nil } func (i *ipam) Start(ctx context.Context) error { + // Prewarm parentNicMac cache by listing all vlan-type SpiderMultusConfigs + if i.config.IaaSClient != nil { + i.prewarmParentNicMacCache(ctx) + } + errCh := make(chan error) go func() { if err := i.ipamLimiter.Start(ctx); err != nil { diff --git a/pkg/ipam/release.go b/pkg/ipam/release.go index 756f42273..d44ab1bea 100644 --- a/pkg/ipam/release.go +++ b/pkg/ipam/release.go @@ -150,10 +150,16 @@ func (i *ipam) releaseForAllNICs(ctx context.Context, uid, nic string, endpoint } logger.Sugar().Infof("Release IP allocation details: %v", allocation.IPs) + if err := i.release(ctx, allocation.UID, allocation.IPs); err != nil { return err } + // Call IaaS provider to release IPs after releasing from internal IPPools + if err := i.callIaaSRelease(ctx, endpoint); err != nil { + return err + } + logger.Info("Clean Endpoint") if err := i.endpointManager.RemoveFinalizer(ctx, endpoint); err != nil { return fmt.Errorf("failed to clean Endpoint: %w", err) diff --git a/pkg/k8s/apis/spiderpool.spidernet.io/v2beta1/spidermultus_types.go b/pkg/k8s/apis/spiderpool.spidernet.io/v2beta1/spidermultus_types.go index 0715074c7..bf113f91a 100644 --- a/pkg/k8s/apis/spiderpool.spidernet.io/v2beta1/spidermultus_types.go +++ b/pkg/k8s/apis/spiderpool.spidernet.io/v2beta1/spidermultus_types.go @@ -155,11 +155,11 @@ type SpiderVlanCniConfig struct { // explicitly set MTU to the specified value. Defaults('0' or no value provided) to the value chosen by the kernel. MTU *int32 `json:"mtu,omitempty"` - // +kubebuilder:validation:Required + // +kubebuilder:validation:Optional // +kubebuilder:validation:Minimum=0 // +kubebuilder:validation:Maximum=4094 // The VLAN ID for the CNI configuration and must be within the specified range: [0,4094]. - VlanID *int32 `json:"vlanID"` + VlanID *int32 `json:"vlanID,omitempty"` // +kubebuilder:validation:Optional // Optional bond configuration for the CNI. It must not be nil if the multiple master interfaces are specified. diff --git a/pkg/multuscniconfig/multusconfig_informer.go b/pkg/multuscniconfig/multusconfig_informer.go index 1d8b54a5c..430b0a6fb 100644 --- a/pkg/multuscniconfig/multusconfig_informer.go +++ b/pkg/multuscniconfig/multusconfig_informer.go @@ -616,9 +616,10 @@ func generateVlanCNIConf(disableIPAM bool, multusConfSpec spiderpoolv2beta1.Mult netConf := VlanNetConf{ Type: constant.VlanCNI, Master: masterName, - VlanID: *multusConfSpec.VlanConfig.VlanID, } + netConf.VlanID = multusConfSpec.VlanConfig.VlanID + if multusConfSpec.VlanConfig.MTU != nil { netConf.MTU = multusConfSpec.VlanConfig.MTU } diff --git a/pkg/multuscniconfig/multusconfig_mutate.go b/pkg/multuscniconfig/multusconfig_mutate.go index 8c3c09d13..c6b3aeb82 100644 --- a/pkg/multuscniconfig/multusconfig_mutate.go +++ b/pkg/multuscniconfig/multusconfig_mutate.go @@ -138,10 +138,6 @@ func setVlanDefaultConfig(vlanConfig *spiderpoolv2beta1.SpiderVlanCniConfig) { return } - if vlanConfig.VlanID == nil { - vlanConfig.VlanID = ptr.To(int32(0)) - } - if vlanConfig.RdmaResourceName == nil { vlanConfig.RdmaResourceName = ptr.To("") } diff --git a/pkg/multuscniconfig/multusconfig_validate.go b/pkg/multuscniconfig/multusconfig_validate.go index b7327c728..04fff83dd 100644 --- a/pkg/multuscniconfig/multusconfig_validate.go +++ b/pkg/multuscniconfig/multusconfig_validate.go @@ -101,10 +101,8 @@ func validateCNIConfig(multusConfig *spiderpoolv2beta1.SpiderMultusConfig) *fiel return field.Required(macvlanConfigField, fmt.Sprintf("no %s specified", macvlanConfigField.String())) } - if multusConfig.Spec.MacvlanConfig.VlanID != nil { - if err := validateVlanID(*multusConfig.Spec.MacvlanConfig.VlanID); err != nil { - return field.Invalid(macvlanConfigField, *multusConfig.Spec.MacvlanConfig.VlanID, err.Error()) - } + if err := validateVlanID(multusConfig.Spec.MacvlanConfig.VlanID); err != nil { + return field.Invalid(macvlanConfigField, *multusConfig.Spec.MacvlanConfig.VlanID, err.Error()) } if multusConfig.Spec.MacvlanConfig.MTU != nil && *multusConfig.Spec.MacvlanConfig.MTU < 0 { @@ -140,10 +138,8 @@ func validateCNIConfig(multusConfig *spiderpoolv2beta1.SpiderMultusConfig) *fiel return field.Required(ipvlanConfigField, fmt.Sprintf("no %s specified", ipvlanConfigField.String())) } - if multusConfig.Spec.IPVlanConfig.VlanID != nil { - if err := validateVlanID(*multusConfig.Spec.IPVlanConfig.VlanID); err != nil { - return field.Invalid(ipvlanConfigField, *multusConfig.Spec.IPVlanConfig.VlanID, err.Error()) - } + if err := validateVlanID(multusConfig.Spec.IPVlanConfig.VlanID); err != nil { + return field.Invalid(ipvlanConfigField, *multusConfig.Spec.IPVlanConfig.VlanID, err.Error()) } if multusConfig.Spec.IPVlanConfig.MTU != nil && *multusConfig.Spec.IPVlanConfig.MTU < 0 { @@ -179,11 +175,7 @@ func validateCNIConfig(multusConfig *spiderpoolv2beta1.SpiderMultusConfig) *fiel return field.Required(vlanConfigField, fmt.Sprintf("no %s specified", vlanConfigField.String())) } - if multusConfig.Spec.VlanConfig.VlanID == nil { - return field.Required(vlanConfigField.Child("vlanID"), fmt.Sprintf("no %s specified", vlanConfigField.Child("vlanID").String())) - } - - if err := validateVlanID(*multusConfig.Spec.VlanConfig.VlanID); err != nil { + if err := validateVlanID(multusConfig.Spec.VlanConfig.VlanID); err != nil { return field.Invalid(vlanConfigField, *multusConfig.Spec.VlanConfig.VlanID, err.Error()) } @@ -220,10 +212,8 @@ func validateCNIConfig(multusConfig *spiderpoolv2beta1.SpiderMultusConfig) *fiel return field.Required(sriovConfigField, fmt.Sprintf("no %s specified", sriovConfigField.String())) } - if multusConfig.Spec.SriovConfig.VlanID != nil { - if err := validateVlanID(*multusConfig.Spec.SriovConfig.VlanID); err != nil { - return field.Invalid(sriovConfigField, *multusConfig.Spec.SriovConfig.VlanID, err.Error()) - } + if err := validateVlanID(multusConfig.Spec.SriovConfig.VlanID); err != nil { + return field.Invalid(sriovConfigField, *multusConfig.Spec.SriovConfig.VlanID, err.Error()) } if multusConfig.Spec.SriovConfig.MTU != nil && *multusConfig.Spec.SriovConfig.MTU < 0 { @@ -314,10 +304,8 @@ func validateCNIConfig(multusConfig *spiderpoolv2beta1.SpiderMultusConfig) *fiel return field.Required(ovsConfigField, fmt.Sprintf("no %s specified", ovsConfigField.String())) } - if multusConfig.Spec.OvsConfig.VlanTag != nil { - if err := validateVlanID(*multusConfig.Spec.OvsConfig.VlanTag); err != nil { - return field.Invalid(ovsConfigField, *multusConfig.Spec.OvsConfig.VlanTag, err.Error()) - } + if err := validateVlanID(multusConfig.Spec.OvsConfig.VlanTag); err != nil { + return field.Invalid(ovsConfigField, *multusConfig.Spec.OvsConfig.VlanTag, err.Error()) } for idx, trunk := range multusConfig.Spec.OvsConfig.Trunk { @@ -395,9 +383,12 @@ func validateVlanCNIConfig(master []string, bond *spiderpoolv2beta1.BondConfig) return nil } -func validateVlanID(vlanID int32) error { - if vlanID < 0 || vlanID > 4094 { - return fmt.Errorf("invalid vlanId %v, please make sure vlanId in range [0,4094]", vlanID) +func validateVlanID(vlanID *int32) error { + if vlanID == nil { + return nil + } + if *vlanID < 0 || *vlanID > 4094 { + return fmt.Errorf("invalid vlanId %v, please make sure vlanId in range [0,4094]", *vlanID) } return nil } diff --git a/pkg/multuscniconfig/utils.go b/pkg/multuscniconfig/utils.go index 9281e8678..83694027a 100644 --- a/pkg/multuscniconfig/utils.go +++ b/pkg/multuscniconfig/utils.go @@ -53,7 +53,7 @@ type IPvlanNetConf struct { type VlanNetConf struct { Type string `json:"type"` Master string `json:"master"` - VlanID int32 `json:"vlanId"` + VlanID *int32 `json:"vlanId,omitempty"` MTU *int32 `json:"mtu,omitempty"` IPAM *spiderpoolcmd.IPAMConfig `json:"ipam,omitempty"` } diff --git a/pkg/types/k8s.go b/pkg/types/k8s.go index c47518646..4f7ab2752 100644 --- a/pkg/types/k8s.go +++ b/pkg/types/k8s.go @@ -123,6 +123,7 @@ type SpiderpoolConfigmapConfig struct { EnableValidatingResourcesDeletedWebhook bool `yaml:"enableValidatingResourcesDeletedWebhook"` IpamUnixSocketPath string `yaml:"ipamUnixSocketPath"` PodResourceInjectConfig PodResourceInjectConfig `yaml:"podResourceInject"` + IaaSProviderConfig IaaSProviderConfig `yaml:"iaasNetworkProvider,omitempty"` } type PodResourceInjectConfig struct { @@ -130,3 +131,7 @@ type PodResourceInjectConfig struct { NamespacesExclude []string `yaml:"namespacesExclude"` NamespacesInclude []string `yaml:"namespacesInclude"` } + +type IaaSProviderConfig struct { + ServerURL string `yaml:"serverUrl,omitempty"` +} diff --git a/specs/003-iaas-provider-integration/checklists/requirements.md b/specs/003-iaas-provider-integration/checklists/requirements.md new file mode 100644 index 000000000..56f44df1a --- /dev/null +++ b/specs/003-iaas-provider-integration/checklists/requirements.md @@ -0,0 +1,115 @@ +# Specification Quality Checklist: IaaS Network Provider Integration + +**Purpose**: Validate specification completeness and quality before proceeding to planning +**Created**: 2025-04-27 +**Feature**: [IaaS Network Provider Integration](../spec.md) + +--- + +## Content Quality + +- [x] No implementation details (languages, frameworks, APIs) +- [x] Focused on user value and business needs +- [x] Written for non-technical stakeholders +- [x] All mandatory sections completed + +--- + +## Requirement Completeness + +- [ ] No [NEEDS CLARIFICATION] markers remain + - **Status**: 3 clarification questions pending + - **Details**: See spec section 8 for Q1, Q2, Q3 +- [x] Requirements are testable and unambiguous +- [x] Success criteria are measurable +- [x] Success criteria are technology-agnostic +- [x] All acceptance scenarios are defined +- [x] Edge cases are identified (disabled integration) +- [x] Scope is clearly bounded +- [x] Dependencies and assumptions identified + +--- + +## Feature Readiness + +- [ ] All functional requirements have clear acceptance criteria + - **Note**: FR3.8 and FR4.3 need acceptance criteria refinement after clarifications +- [x] User scenarios cover primary flows + - Allocation flow + - Release flow + - GC flow + - Disabled integration +- [x] Feature meets measurable outcomes defined in Success Criteria +- [x] No implementation details leak into specification + +--- + +## Clarification Status + +### Question 1: IaaS Release API Timing ✅ RESOLVED +**Question**: Should the IaaS release API be called before or after Spiderpool releases the IP from its pool? + +**Decision**: **Option B** - Release Spiderpool IP first, then call IaaS API + +**Rationale**: Prioritizes speed of IP release; IaaS cleanup failures are logged but don't block Spiderpool operations + +**Implementation Notes**: +- Spiderpool IP release proceeds immediately +- IaaS API is called asynchronously or after Spiderpool release +- IaaS cleanup failures are logged and alerted, but don't fail the IP release + +### Question 2: Authentication Mechanism ✅ RESOLVED +**Question**: What authentication mechanism is required for the IaaS provider API? + +**Decision**: **mTLS with client certificates** + +**Implementation Notes**: +- Client certificate and key mounted from Kubernetes secrets +- CA certificate configured to verify IaaS provider server +- Helm values configure secret names and key names +- See spec section 4 (Data Model) for configuration schema + +### Question 3: Parent NIC MAC Discovery ✅ RESOLVED +**Question**: How should the `parentNicMac` be determined? + +**Decision**: Parse Pod's Multus annotation → Identify SpiderMultusConfig → If VLAN CNI, get master NIC name → Retrieve MAC via netlink + +**Implementation Notes**: +- Reuse existing Multus annotation parsing from IPAM code +- Check SpiderMultusConfig for CNI type +- Use netlink to get master interface MAC +- See spec section "9. Implementation Notes" for details + +--- + +## Summary + +**Status**: ✅ **READY FOR TASKS** + +All 3 clarification questions have been resolved: +1. ✅ IaaS Release API timing: **Option B (Spiderpool first, then IaaS)** +2. ✅ Authentication: **mTLS with client certificates** +3. ✅ Parent NIC MAC discovery: **Multus annotation → SpiderMultusConfig → netlink** + +Specification and planning are complete. Ready to proceed to `/speckit.tasks`. + +--- + +## Tasks Generation + +- [x] Tasks organized by phases +- [x] Parallel execution opportunities identified +- [x] Dependencies mapped +- [x] Test criteria defined +- [x] MVP scope identified (T001-T010) + +--- + +## Next Steps + +✅ All tasks generated. Ready to proceed: + +1. ~~Resolve clarification questions~~ ✅ Complete +2. ~~Create implementation plan~~ ✅ Complete +3. ~~Generate tasks~~ ✅ Complete +4. **Proceed to `/speckit.implement`** ⏳ Next step diff --git a/specs/003-iaas-provider-integration/data-model.md b/specs/003-iaas-provider-integration/data-model.md new file mode 100644 index 000000000..31739382b --- /dev/null +++ b/specs/003-iaas-provider-integration/data-model.md @@ -0,0 +1,232 @@ +# Data Model: IaaS Network Provider Integration + +**Feature**: IaaS Network Provider Integration +**Phase**: Phase 1 - Configuration Infrastructure + +--- + +## 1. Configuration Entities + +### IaaSProviderConfig + +The top-level configuration for IaaS provider integration. + +| Field | Type | Required | Default | Description | +|-------|------|----------|---------|-------------| +| URL | string | No | "" | IaaS provider endpoint URL (host:port). Empty disables integration. | +| TLSSecret | TLSSecretConfig | No | {} | TLS certificate configuration for mTLS authentication. Required if URL is set. | + +### TLSSecretConfig + +TLS certificate secret reference. + +| Field | Type | Required | Default | Description | +|-------|------|----------|---------|-------------| +| Name | string | Yes* | "" | Kubernetes secret name containing tls.crt and tls.key | +| Namespace | string | Yes* | "" | Kubernetes namespace where the secret exists | + +\* Required when `URL` is non-empty. + +--- + +## 2. Validation Rules + +### Rule 1: Empty URL Disables Integration + +``` +IF URL == "" THEN + IaaS integration is disabled + TLSSecret is ignored + No validation required +END IF +``` + +### Rule 2: Non-Empty URL Requires TLS Config + +``` +IF URL != "" THEN + IF TLSSecret.Name == "" THEN + ERROR: "TLS secret name is required when IaaS provider URL is configured" + END IF + + IF TLSSecret.Namespace == "" THEN + ERROR: "TLS secret namespace is required when IaaS provider URL is configured" + END IF +END IF +``` + +### Rule 3: Secret Format Validation + +The referenced secret must contain: +- `tls.crt`: Base64-encoded X.509 client certificate +- `tls.key`: Base64-encoded RSA/EC private key + +``` +VALIDATE secret: + - Secret exists in specified namespace + - Secret contains key "tls.crt" + - Secret contains key "tls.key" + - Certificate is valid (not expired) + - Key matches certificate +``` + +--- + +## 3. Environment Variable Mapping + +Configuration is passed via environment variables: + +| Environment Variable | Source Field | Example Value | +|----------------------|--------------|---------------| +| `SPIDERPOOL_IAAS_PROVIDER_URL` | `iaasNetworkProvider.url` | `iaas-provider:444` | +| `SPIDERPOOL_IAAS_TLS_SECRET_NAME` | `iaasNetworkProvider.tlsSecret.name` | `iaas-provider-client-cert` | +| `SPIDERPOOL_IAAS_TLS_SECRET_NAMESPACE` | `iaasNetworkProvider.tlsSecret.namespace` | `spiderpool` | +| `SPIDERPOOL_IAAS_TLS_CERT_PATH` | Hardcoded | `/etc/spiderpool/iaas-tls/tls.crt` | +| `SPIDERPOOL_IAAS_TLS_KEY_PATH` | Hardcoded | `/etc/spiderpool/iaas-tls/tls.key` | + +--- + +## 4. Kubernetes Resource Model + +### User-Created Secret + +```yaml +apiVersion: v1 +kind: Secret +metadata: + name: + namespace: +type: kubernetes.io/tls +data: + tls.crt: + tls.key: +``` + +### Spiderpool Agent Volume Mount + +```yaml +apiVersion: apps/v1 +kind: DaemonSet +spec: + template: + spec: + containers: + - name: spiderpool-agent + volumeMounts: + - name: iaas-tls + mountPath: /etc/spiderpool/iaas-tls + readOnly: true + volumes: + - name: iaas-tls + secret: + secretName: + namespace: + items: + - key: tls.crt + path: tls.crt + - key: tls.key + path: tls.key +``` + +### Spiderpool Controller Volume Mount + +Same structure as Agent. + +--- + +## 5. State Transitions + +### Configuration Loading State Machine + +``` +┌─────────────────┐ +│ Start/Reload │ +└────────┬────────┘ + │ + ▼ +┌─────────────────┐ +│ Load Config │ +│ (env vars) │ +└────────┬────────┘ + │ + ▼ + ┌─────────┐ + │ URL set?│ + └────┬────┘ + │ + ┌──┴──┐ + No│ │Yes + │ │ + ▼ ▼ +┌─────────┐ ┌─────────────────┐ +│ Disabled│ │ Validate Secret │ +│ State │ │ Reference │ +└─────────┘ └────────┬────────┘ + │ + ┌────┴────┐ + OK│ │Error + │ │ + ▼ ▼ + ┌─────────────┐ ┌──────────────┐ + │ Enabled │ │ Error │ + │ State │ │ State │ + │ (Ready for │ │ (Log error, │ + │ Phase 2) │ │ keep retry) │ + └─────────────┘ └──────────────┘ +``` + +--- + +## 6. Error States + +| Error Code | Description | Recovery | +|------------|-------------|----------| +| `CONFIG_INVALID` | Missing required TLS secret config | Fix Helm values and redeploy | +| `SECRET_NOT_FOUND` | Referenced secret doesn't exist | Create secret or fix reference | +| `SECRET_INVALID` | Secret missing tls.crt or tls.key | Recreate secret with correct keys | +| `CERT_EXPIRED` | TLS certificate is expired | Update secret with valid certificate | + +--- + +## 7. Phase 2 Data Model (Future) + +For reference, Phase 2 will add: + +### IaaSAllocateRequest + +```json +{ + "podName": "p1", + "podNamespace": "ns1", + "podUID": "1234567890", + "nodeName": "worker-01", + "iaasIPsAllocationRequest": [ + { + "ipAddress": "10.0.0.10", + "subnet": "10.0.0.0/24", + "parentNicMac": "fa:16:3e:xx:xx:xx" + } + ] +} +``` + +### IaaSAllocateResponse + +```json +{ + "podName": "p1", + "podNamespace": "ns1", + "nodeName": "worker-01", + "iaasIPsAllocationResponse": [ + { + "parentNicMac": "fa:16:3e:xx:xx:xx", + "subnet": "10.251.0.0/24", + "ipAddress": "10.0.0.10", + "macAddress": "fa:16:3e:xx:xx:xx", + "vlanId": 100 + } + ] +} +``` + +These will be used by the API client implemented in Phase 2. diff --git a/specs/003-iaas-provider-integration/plan.md b/specs/003-iaas-provider-integration/plan.md new file mode 100644 index 000000000..bc7f35e23 --- /dev/null +++ b/specs/003-iaas-provider-integration/plan.md @@ -0,0 +1,315 @@ +# Implementation Plan: IaaS Network Provider Integration + +**Feature**: IaaS Network Provider Integration +**Branch**: `003-iaas-provider-integration` +**Created**: 2025-04-27 +**Spec**: [spec.md](./spec.md) + +--- + +## 1. Technical Context + +### Project Structure + +``` +spiderpool/ +├── charts/spiderpool/ # Helm charts +│ ├── values.yaml # Helm values (needs: iaasNetworkProvider config) +│ └── templates/ # Kubernetes manifests +│ ├── daemonset.yaml # Agent daemonset (needs: secret volume mounts) +│ └── deployment.yaml # Controller deployment (needs: secret volume mounts) +│ +├── cmd/ +│ ├── spiderpool-agent/ +│ │ └── cmd/ +│ │ ├── config.go # Agent configuration (needs: IaaS provider config) +│ │ └── daemon.go # Agent initialization (needs: secret validation) +│ │ +│ └── spiderpool-controller/ +│ └── cmd/ +│ ├── config.go # Controller configuration (needs: IaaS provider config) +│ └── daemon.go # Controller initialization (needs: secret validation) +│ +└── pkg/ + └── config/ # Configuration types + └── config.go # Global config struct (needs: IaaSProviderConfig) +``` + +### Key Technologies + +- **Helm**: Kubernetes package manager for templating manifests +- **Kubernetes Secrets**: For storing TLS certificates +- **Volume Mounts**: Mounting secrets into pods +- **Environment Variables**: Passing configuration to applications +- **Go**: Application language for Agent and Controller + +### Existing Similar Implementations + +Looking at Spiderpool's existing Helm chart structure: +- Certificate mounting for webhook: `spiderpool-controller.tls` +- Secret configuration pattern in `values.yaml` +- Volume and volumeMount patterns in daemonset/deployment templates + +--- + +## 2. Constitution Check + +### Constitution Alignment + +| Principle | Alignment | Notes | +|-----------|-----------|-------| +| Minimal Configuration | ✅ | Only 4 fields required: url, secret name, secret namespace | +| Secure by Default | ✅ | mTLS enforced when configured | +| Backward Compatible | ✅ | Empty URL disables integration completely | +| Separation of Concerns | ✅ | Phase 1: Config only, Phase 2: API calls | + +### Gate Evaluation + +**Architecture Gate**: +- No structural changes to core IPAM logic (Phase 1) +- Configuration follows existing Helm patterns +- ✅ PASS + +**Security Gate**: +- mTLS with certificate mounting +- Secrets in dedicated namespace +- No hardcoded credentials +- ✅ PASS + +**Testing Gate**: +- Helm template tests for secret mounting +- Configuration validation tests +- ⏳ PENDING (TBD in tasks) + +--- + +## 3. Design & Contracts + +### Data Model + +#### Configuration Schema + +```yaml +iaasNetworkProvider: + url: string # IaaS provider endpoint (host:port) + tlsSecret: + name: string # Kubernetes secret name + namespace: string # Secret namespace +``` + +#### Environment Variables + +| Variable | Source | Description | +|----------|--------|-------------| +| `SPIDERPOOL_IAAS_PROVIDER_URL` | `values.yaml` → ConfigMap | IaaS provider URL | +| `SPIDERPOOL_IAAS_TLS_SECRET_NAME` | `values.yaml` → ConfigMap | TLS secret name | +| `SPIDERPOOL_IAAS_TLS_SECRET_NAMESPACE` | `values.yaml` → ConfigMap | TLS secret namespace | +| `SPIDERPOOL_IAAS_TLS_CERT_PATH` | Hardcoded | Mounted cert path | +| `SPIDERPOOL_IAAS_TLS_KEY_PATH` | Hardcoded | Mounted key path | + +### Helm Values Contract + +New `values.yaml` section: + +```yaml +iaasNetworkProvider: + # URL of the IaaS provider service (host:port) + # If empty, IaaS integration is disabled + url: "" + + # TLS certificate configuration for mTLS authentication + # Secret must exist and contain tls.crt and tls.key + tlsSecret: + name: "" + namespace: "" +``` + +### Secret Mount Contract + +**Source Secret** (user-provided): +```yaml +apiVersion: v1 +kind: Secret +metadata: + name: iaas-provider-client-cert + namespace: spiderpool +type: kubernetes.io/tls +data: + tls.crt: + tls.key: +``` + +**Mount Target**: +- Path: `/etc/spiderpool/iaas-tls/` +- Files: `tls.crt`, `tls.key` +- Read-only: true + +--- + +## 4. Implementation Phases + +### Phase 0: Research (Complete) + +No research needed. This is a configuration-only task using existing Helm/Kubernetes patterns. + +### Phase 1: Configuration Infrastructure + +#### T001: Update Helm values schema +**File**: `charts/spiderpool/values.yaml` +**Task**: Add `iaasNetworkProvider` configuration section +**Effort**: Small +**Dependencies**: None + +#### T002: Add Agent secret volume mount +**File**: `charts/spiderpool/templates/daemonset.yaml` +**Task**: Add volume and volumeMount for TLS secret in Agent daemonset +**Effort**: Small +**Dependencies**: T001 + +#### T003: Add Controller secret volume mount +**File**: `charts/spiderpool/templates/deployment.yaml` +**Task**: Add volume and volumeMount for TLS secret in Controller deployment +**Effort**: Small +**Dependencies**: T001 + +#### T004: Create ConfigMap template for IaaS config +**File**: `charts/spiderpool/templates/configmap.yaml` (new or existing) +**Task**: Add IaaS provider configuration to ConfigMap +**Effort**: Small +**Dependencies**: T001 + +#### T005: Add Go configuration types +**File**: `pkg/config/config.go` +**Task**: Add `IaaSProviderConfig` struct +**Effort**: Small +**Dependencies**: None + +```go +type IaaSProviderConfig struct { + URL string `yaml:"url"` + TLSSecret TLSSecretConfig `yaml:"tlsSecret"` +} + +type TLSSecretConfig struct { + Name string `yaml:"name"` + Namespace string `yaml:"namespace"` +} +``` + +#### T006: Agent configuration loading +**File**: `cmd/spiderpool-agent/cmd/config.go` +**Task**: Load IaaS provider config from environment +**Effort**: Small +**Dependencies**: T005 + +#### T007: Controller configuration loading +**File**: `cmd/spiderpool-controller/cmd/config.go` +**Task**: Load IaaS provider config from environment +**Effort**: Small +**Dependencies**: T005 + +#### T008: Agent startup validation +**File**: `cmd/spiderpool-agent/cmd/daemon.go` +**Task**: Validate IaaS secret existence at startup (when configured) +**Effort**: Medium +**Dependencies**: T006 + +#### T009: Controller startup validation +**File**: `cmd/spiderpool-controller/cmd/daemon.go` +**Task**: Validate IaaS secret existence at startup (when configured) +**Effort**: Medium +**Dependencies**: T007 + +#### T010: Helm template tests +**Files**: Test files or CI scripts +**Task**: Add tests for Helm template rendering with IaaS config +**Effort**: Medium +**Dependencies**: T002, T003, T004 + +### Phase 2: Future (Not in Current Plan) + +See spec.md for Phase 2 requirements (API client, hooks, MAC storage). + +--- + +## 5. Task Dependencies + +``` +T001 (Helm values) + ├── T002 (Agent volume mount) + ├── T003 (Controller volume mount) + └── T004 (ConfigMap) + +T005 (Go types) + ├── T006 (Agent config loading) + │ └── T008 (Agent validation) + └── T007 (Controller config loading) + └── T009 (Controller validation) + +T002 + T003 + T004 → T010 (Helm tests) +``` + +--- + +## 6. Quickstart + +### Pre-requisites + +1. Kubernetes cluster with Spiderpool installed +2. IaaS provider service endpoint accessible +3. TLS certificate and key for mTLS authentication + +### Setup Steps + +1. **Create TLS Secret**: +```bash +kubectl create secret tls iaas-provider-client-cert \ + --cert=client.crt \ + --key=client.key \ + -n spiderpool +``` + +2. **Configure Spiderpool**: +```bash +helm upgrade spiderpool spiderpool/spiderpool \ + --set iaasNetworkProvider.url="iaas-provider:444" \ + --set iaasNetworkProvider.tlsSecret.name="iaas-provider-client-cert" \ + --set iaasNetworkProvider.tlsSecret.namespace="spiderpool" +``` + +3. **Verify Mount**: +```bash +kubectl exec -n spiderpool ds/spiderpool-agent -- ls /etc/spiderpool/iaas-tls/ +# Should show: tls.crt tls.key +``` + +--- + +## 7. Deliverables + +| Artifact | Path | Status | +|----------|------|--------| +| Spec | `specs/003-iaas-provider-integration/spec.md` | ✅ Complete | +| Plan | `specs/003-iaas-provider-integration/plan.md` | ✅ Complete | +| Checklist | `specs/003-iaas-provider-integration/checklists/requirements.md` | ✅ Complete | +| Helm Values | `charts/spiderpool/values.yaml` | ⏳ T001 | +| Agent Daemonset | `charts/spiderpool/templates/daemonset.yaml` | ⏳ T002 | +| Controller Deployment | `charts/spiderpool/templates/deployment.yaml` | ⏳ T003 | +| ConfigMap | `charts/spiderpool/templates/configmap.yaml` | ⏳ T004 | +| Go Types | `pkg/config/config.go` | ⏳ T005 | +| Agent Config | `cmd/spiderpool-agent/cmd/config.go` | ⏳ T006 | +| Controller Config | `cmd/spiderpool-controller/cmd/config.go` | ⏳ T007 | +| Agent Validation | `cmd/spiderpool-agent/cmd/daemon.go` | ⏳ T008 | +| Controller Validation | `cmd/spiderpool-controller/cmd/daemon.go` | ⏳ T009 | +| Helm Tests | Test files | ⏳ T010 | + +--- + +## 8. Next Steps + +1. Review plan with stakeholders +2. Create feature branch: `003-iaas-provider-integration` +3. Execute Phase 1 tasks (T001-T010) +4. Test Helm chart rendering +5. Validate secret mounting in test cluster +6. Proceed to `/speckit.tasks` for task generation diff --git a/specs/003-iaas-provider-integration/quickstart.md b/specs/003-iaas-provider-integration/quickstart.md new file mode 100644 index 000000000..a56980c9a --- /dev/null +++ b/specs/003-iaas-provider-integration/quickstart.md @@ -0,0 +1,277 @@ +# Quickstart: IaaS Network Provider Integration + +**Feature**: IaaS Network Provider Integration +**Phase**: Phase 1 - Configuration Infrastructure + +--- + +## Overview + +This quickstart guides you through configuring Spiderpool to integrate with an IaaS (Infrastructure as a Service) network provider. In Phase 1, we focus on setting up the configuration and TLS certificate infrastructure. The actual API integration (Phase 2) will be implemented in a future update. + +--- + +## Prerequisites + +1. Kubernetes cluster (v1.21+) +2. Spiderpool v0.9+ installed via Helm +3. IaaS provider service endpoint accessible from the cluster +4. TLS client certificate and key for mTLS authentication with the IaaS provider + +--- + +## Step 1: Prepare TLS Certificates + +Obtain the client certificate and private key from your IaaS provider. You should have: +- `client.crt`: X.509 client certificate +- `client.key`: RSA or EC private key + +--- + +## Step 2: Create Kubernetes Secret + +Create a TLS secret in your Spiderpool namespace (default: `spiderpool`): + +```bash +# Create the secret +kubectl create secret tls iaas-provider-client-cert \ + --cert=client.crt \ + --key=client.key \ + -n spiderpool + +# Verify the secret was created +kubectl get secret iaas-provider-client-cert -n spiderpool +``` + +**Output**: +``` +NAME TYPE DATA AGE +iaas-provider-client-cert kubernetes.io/tls 2 10s +``` + +--- + +## Step 3: Configure Spiderpool + +Update your Spiderpool Helm values with the IaaS provider configuration: + +### Option A: Using Helm CLI + +```bash +helm upgrade spiderpool spiderpool/spiderpool \ + --namespace spiderpool \ + --set iaasNetworkProvider.url="iaas-network-provider:444" \ + --set iaasNetworkProvider.tlsSecret.name="iaas-provider-client-cert" \ + --set iaasNetworkProvider.tlsSecret.namespace="spiderpool" +``` + +### Option B: Using values.yaml + +Edit your `values.yaml`: + +```yaml +iaasNetworkProvider: + # URL of the IaaS provider service (host:port) + url: "iaas-network-provider:444" + + # TLS certificate configuration + tlsSecret: + name: "iaas-provider-client-cert" + namespace: "spiderpool" +``` + +Then apply: + +```bash +helm upgrade spiderpool spiderpool/spiderpool \ + --namespace spiderpool \ + -f values.yaml +``` + +--- + +## Step 4: Verify Configuration + +### 4.1 Check Agent Pods + +```bash +# List agent pods +kubectl get pods -n spiderpool -l app.kubernetes.io/component=spiderpool-agent + +# Check agent logs for IaaS configuration +kubectl logs -n spiderpool ds/spiderpool-agent | grep -i iaas +``` + +**Expected output**: +``` +IaaS provider configuration loaded: url=iaas-network-provider:444 +IaaS TLS secret: name=iaas-provider-client-cert, namespace=spiderpool +``` + +### 4.2 Verify Secret Mount + +```bash +# Check certificate is mounted +kubectl exec -n spiderpool ds/spiderpool-agent -- ls -la /etc/spiderpool/iaas-tls/ +``` + +**Expected output**: +``` +total 4 +drwxrwxrwt 3 root root 120 Apr 27 12:00 . +drwxr-xr-x 1 root root 4096 Apr 27 10:00 .. +drwxr-xr-x 2 root root 40 Apr 27 12:00 ..2025_04_27_12_00_00 +drwxr-xr-x 9 root root 180 Apr 27 12:00 ..data +drwxrwxr-x 2 root root 60 Apr 27 12:00 tls.crt +drwxrwxr-x 2 root root 60 Apr 27 12:00 tls.key +``` + +### 4.3 Check Controller + +```bash +# Check controller logs +kubectl logs -n spiderpool deployment/spiderpool-controller | grep -i iaas +``` + +--- + +## Step 5: Validation Tests + +### Test 1: Certificate Readable + +```bash +# Verify certificate content +kubectl exec -n spiderpool ds/spiderpool-agent -- \ + cat /etc/spiderpool/iaas-tls/tls.crt | head -5 +``` + +**Expected**: X.509 certificate content starting with `-----BEGIN CERTIFICATE-----` + +### Test 2: Configuration Loaded in Agent/Controller + +```bash +# Check Agent logs for IaaS configuration +kubectl logs -n spiderpool ds/spiderpool-agent | grep -i "iaas" + +# Check Controller logs for IaaS configuration +kubectl logs -n spiderpool deployment/spiderpool-controller | grep -i "iaas" +``` + +**Expected output**: +``` +IaaS provider configuration detected {"url": "iaas-network-provider:444"} +IaaS provider TLS configuration {"secretName": "iaas-provider-client-cert", "secretNamespace": "spiderpool"} +``` + +### Test 3: Configuration via ConfigMap + +```bash +# Verify ConfigMap contains IaaS configuration +kubectl get configmap -n spiderpool spiderpool-conf -o yaml | grep -A10 "iaasNetworkProvider" +``` + +**Expected output**: +```yaml +iaasNetworkProvider: + url: "iaas-network-provider:444" + tlsSecret: + name: "iaas-provider-client-cert" + namespace: "spiderpool" + tlsCertPath: "/etc/spiderpool/iaas-tls/tls.crt" + tlsKeyPath: "/etc/spiderpool/iaas-tls/tls.key" +``` + +--- + +## Disabling IaaS Integration + +To disable the IaaS integration: + +```bash +helm upgrade spiderpool spiderpool/spiderpool \ + --namespace spiderpool \ + --set iaasNetworkProvider.url="" +``` + +Or in `values.yaml`: + +```yaml +iaasNetworkProvider: + url: "" +``` + +**Note**: When URL is empty, the TLS secret configuration is ignored and no secret mounting occurs. + +--- + +## Troubleshooting + +### Issue: Secret not found + +**Error**: +``` +Error: secret "iaas-provider-client-cert" not found +``` + +**Solution**: +1. Verify secret exists: `kubectl get secret -n spiderpool` +2. Check namespace: Ensure secret is in the same namespace as specified in `tlsSecret.namespace` +3. Create secret: Follow Step 2 + +### Issue: Certificate not mounted + +**Symptom**: `/etc/spiderpool/iaas-tls/` directory is empty + +**Solution**: +1. Check Helm values are applied: `helm get values spiderpool -n spiderpool` +2. Verify URL is not empty: `iaasNetworkProvider.url` must be set +3. Restart pods: `kubectl rollout restart daemonset/spiderpool-agent -n spiderpool` + +### Issue: Agent fails to start + +**Error**: +``` +Failed to validate IaaS configuration: TLS secret name is required +``` + +**Solution**: Set both `tlsSecret.name` and `tlsSecret.namespace` when `url` is configured. + +--- + +## Next Steps + +After Phase 1 configuration is complete and validated: + +1. **Monitor**: Watch for Phase 2 implementation updates +2. **API Integration**: Phase 2 will add the actual IaaS API client +3. **MAC Storage**: Returned MAC addresses from IaaS provider will be stored in SpiderEndpoint + +--- + +## Example: Complete values.yaml + +```yaml +# Spiderpool Helm values with IaaS provider configuration + +spiderpoolAgent: + image: + tag: "v0.9.0" + +spiderpoolController: + image: + tag: "v0.9.0" + +# IaaS provider integration configuration +iaasNetworkProvider: + # Required: IaaS provider URL (host:port) + # Set to empty string "" to disable integration + url: "iaas-network-provider:444" + + # Required when URL is set: TLS secret configuration + tlsSecret: + # Kubernetes secret name containing tls.crt and tls.key + name: "iaas-provider-client-cert" + + # Kubernetes namespace where the secret exists + namespace: "spiderpool" +``` diff --git a/specs/003-iaas-provider-integration/spec.md b/specs/003-iaas-provider-integration/spec.md new file mode 100644 index 000000000..3d9eb6041 --- /dev/null +++ b/specs/003-iaas-provider-integration/spec.md @@ -0,0 +1,301 @@ +# Specification: IaaS Network Provider Integration + +**Branch Name**: `003-iaas-provider-integration` +**Short Name**: `iaas-provider-integration` +**Feature Number**: `003` + +--- + +## 1. User Story + +**As a** cluster administrator using Spiderpool with a third-party cloud provider +**I want** Spiderpool to integrate with the cloud provider's IaaS IP management API +**So that** IP allocations in Spiderpool are synchronized with the cloud provider's network infrastructure, ensuring proper MAC address assignment and VLAN configuration for Pods + +--- + +## 2. User Scenarios + +### Scenario 1: Pod IP Allocation with IaaS Integration + +**Given**: A Pod is being created with Spiderpool IPAM +**And**: The `iaas-network-provider` URL is configured in Spiderpool Agent +**When**: Spiderpool Agent allocates IPs for the Pod +**Then**: +- Spiderpool Agent calls the IaaS provider's `/v1/iaas.network.io/ipam/allocate-ips` API +- The request includes node name, IP addresses, subnets, and parent NIC MAC +- The IaaS provider returns MAC addresses and VLAN IDs for the allocated IPs +- Spiderpool Agent stores the returned MAC addresses in SpiderEndpoint +- The Pod receives the complete network configuration (IP, MAC, VLAN) + +### Scenario 2: Pod IP Release with IaaS Integration + +**Given**: A Pod is being deleted or its IPs are being released +**And**: The `iaas-network-provider` URL is configured +**When**: Spiderpool Agent releases IPs for the Pod +**Then**: +- Spiderpool Agent calls the IaaS provider's `/v1/iaas.network.io/ipam/release-ip` API +- The IaaS provider releases the corresponding IaaS resources +- The IP release in Spiderpool is completed successfully + +### Scenario 3: IP Garbage Collection with IaaS Integration + +**Given**: The Spiderpool Controller is performing IP garbage collection +**And**: Orphaned IPs are identified for Pods that no longer exist +**And**: The `iaas-network-provider` URL is configured in Spiderpool Controller +**When**: Spiderpool Controller releases the orphaned IPs +**Then**: +- Spiderpool Controller calls the IaaS provider's release API for each orphaned IP +- The IaaS resources are properly cleaned up +- The IPs are released from Spiderpool + +### Scenario 4: Disabled IaaS Integration + +**Given**: The `iaas-network-provider` URL is empty or not configured +**When**: Spiderpool performs IP allocation or release +**Then**: Spiderpool operates normally without calling any IaaS provider API + +--- + +## 3. Functional Requirements + +### FR1: Configuration + +| ID | Requirement | Priority | +|----|-------------|----------| +| FR1.1 | Spiderpool Helm values must support `iaasNetworkProvider.url` configuration | Must | +| FR1.2 | The URL format should be `host:port` (e.g., `iaas-network-provider:444`) | Must | +| FR1.3 | Both Spiderpool Agent and Controller must read the IaaS provider URL from configuration | Must | +| FR1.4 | If the URL is empty, IaaS integration is disabled | Must | + +### FR2: IaaS API Client (Phase 2 - Future) + +| ID | Requirement | Priority | +|----|-------------|----------| +| FR2.1 | Implement an HTTP client for communicating with the IaaS provider | Must | +| FR2.2 | The client must support the allocate API: `POST /v1/iaas.network.io/ipam/allocate-ips` | Must | +| FR2.3 | The client must support the release API: `POST /v1/iaas.network.io/ipam/release-ip` | Must | +| FR2.4 | The client must have configurable timeout and retry logic | Should | +| FR2.5 | The client must handle API errors gracefully with proper logging | Must | +| FR2.6 | The client must support mTLS using mounted certificates | Must | + +### FR3: Configuration Infrastructure (Phase 1) + +| ID | Requirement | Priority | +|----|-------------|----------| +| FR3.1 | Helm chart supports `iaasNetworkProvider.url` configuration | Must | +| FR3.2 | Helm chart supports `iaasNetworkProvider.tlsSecret.name` configuration | Must | +| FR3.3 | Helm chart supports `iaasNetworkProvider.tlsSecret.namespace` configuration | Must | +| FR3.4 | Secret containing `tls.crt` and `tls.key` is mounted into Agent pod | Must | +| FR3.5 | Secret containing `tls.crt` and `tls.key` is mounted into Controller pod | Must | +| FR3.6 | Configuration is passed to Agent via environment variables | Must | +| FR3.7 | Configuration is passed to Controller via environment variables | Must | +| FR3.8 | Agent validates secret existence at startup (when URL is configured) | Should | +| FR3.9 | Controller validates secret existence at startup (when URL is configured) | Should | + +### FR4: IP Allocation Flow (Phase 2 - Future) + +| ID | Requirement | Priority | +|----|-------------|----------| +| FR4.1 | After Spiderpool allocates IPs, if IaaS provider is configured, call the allocate API | Must | +| FR4.2 | The allocate request must include: `nodeName` (required), `iaasIPsAllocationRequest` array (required) | Must | +| FR4.3 | Each allocation request item must include: `ipAddress`, `subnet`, `parentNicMac` (all required) | Must | +| FR4.4 | Optional fields may include: `podName`, `podNamespace`, `podUID` | Should | +| FR4.5 | On successful response, extract `macAddress` and `vlanId` from the response | Must | +| FR4.6 | Store the returned `macAddress` in SpiderEndpoint's `IPAllocationDetail.MAC` field | Must | +| FR4.7 | Store the returned `vlanId` in SpiderEndpoint's `IPAllocationDetail.Vlan` field | Must | +| FR4.8 | If the IaaS API call fails, the IP allocation should fail with appropriate error | Must | + +### FR5: IP Release Flow (Phase 2 - Future) + +| ID | Requirement | Priority | +|----|-------------|----------| +| FR5.1 | Before or during Spiderpool IP release, if IaaS provider is configured, call the release API | Must | +| FR5.2 | The release request must include information to identify the IPs being released | Must | +| FR5.3 | The release flow should proceed even if IaaS API call fails (with logging) | Should | +| FR5.4 | IP Garbage Collection in Controller must also trigger IaaS release calls | Must | + +### FR6: Error Handling and Observability (Phase 2 - Future) + +| ID | Requirement | Priority | +|----|-------------|----------| +| FR6.1 | All IaaS API calls must be logged with context (Pod info, IPs, result) | Must | +| FR6.2 | Metrics should track IaaS API call latency and success/failure rates | Should | +| FR6.3 | Clear error messages when IaaS integration fails | Must | +| FR6.4 | Support for configuring API timeouts to avoid blocking IPAM operations | Should | + +--- + +## 4. Data Model + +### IaaS IP Allocation Request + +```json +{ + "podName": "p1", + "podNamespace": "ns1", + "podUID": "1234567890", + "nodeName": "worker-01", + "iaasIPsAllocationRequest": [ + { + "ipAddress": "10.0.0.10", + "subnet": "10.0.0.0/24", + "parentNicMac": "fa:16:3e:xx:xx:xx" + } + ] +} +``` + +### IaaS IP Allocation Response + +```json +{ + "podName": "p1", + "podNamespace": "ns1", + "nodeName": "worker-01", + "iaasIPsAllocationResponse": [ + { + "parentNicMac": "fa:16:3e:xx:xx:xx", + "subnet": "10.251.0.0/24", + "ipAddress": "10.0.0.10", + "macAddress": "fa:16:3e:xx:xx:xx", + "vlanId": 100 + } + ] +} +``` + +### Configuration Schema + +```yaml +iaasNetworkProvider: + # URL of the IaaS provider service (host:port) + # If empty, IaaS integration is disabled + url: "iaas-network-provider:444" + + # TLS certificate configuration for mTLS authentication + # Secret must exist and contain tls.crt and tls.key + tlsSecret: + name: "iaas-provider-client-cert" # Kubernetes secret name + namespace: "spiderpool" # Secret namespace +``` + +### Phase 1: Configuration and Secret Mounting (Current) + +This phase implements the Helm configuration and Kubernetes secret mounting infrastructure: + +1. **Helm Values**: Support `iaasNetworkProvider` configuration block +2. **Secret Mounting**: Mount specified secrets as volumes into Agent and Controller pods +3. **Config Propagation**: Pass configuration to Agent and Controller via environment variables or config files +4. **Validation**: Validate secret existence and format at startup + +**Note**: The actual IaaS API client implementation and calling logic will be implemented in Phase 2. + +### Phase 2: API Implementation (Future) + +This phase will implement the actual IaaS API integration: + +1. **API Client**: HTTP client with mTLS support +2. **Allocation Hook**: Call IaaS allocate API after Spiderpool IP allocation +3. **Release Hook**: Call IaaS release API during IP release +4. **MAC Storage**: Store returned MAC addresses in SpiderEndpoint + +--- + +## 5. Success Criteria + +| ID | Criterion | Measurement | +|----|-----------|-------------| +| SC1 | IP allocation with IaaS integration completes within 5 seconds (including IaaS API call) | 95th percentile latency < 5s | +| SC2 | IaaS-allocated MAC addresses are correctly stored and retrievable via WorkloadEndpoint API | 100% of allocations have MAC stored | +| SC3 | IP release properly triggers IaaS cleanup | Zero orphaned IaaS resources after Pod deletion | +| SC4 | When IaaS provider is disabled, Spiderpool operates with no performance impact | Latency difference < 10ms vs enabled | +| SC5 | IaaS integration is transparent to CNI plugins | No changes required in CNI configuration | + +### Phase 1 Success Criteria + +| ID | Criterion | Measurement | +|----|-----------|-------------| +| SC-P1-1 | Helm values support complete IaaS provider configuration | All config options renderable via Helm | +| SC-P1-2 | Secrets are mounted into Agent and Controller pods | Secret files accessible at configured paths | +| SC-P1-3 | Configuration is passed to components | Environment variables or config files populated | +| SC-P1-4 | Components start successfully with configuration | No startup errors when IaaS config is provided | + +--- + +## 6. Assumptions and Dependencies + +### Assumptions + +- The IaaS provider API is available and responsive +- The IaaS provider returns valid MAC addresses in standard format +- The `parentNicMac` is obtained by parsing Pod's Multus annotation to identify SpiderMultusConfig +- For VLAN CNI type, the master NIC's MAC address is retrieved via netlink +- Network connectivity exists between Spiderpool Agent/Controller and the IaaS provider + +### Dependencies + +- Existing WorkloadEndpoint Query API implementation (for MAC storage/retrieval) +- SpiderEndpoint CRD with MAC field (already implemented) +- IPAM allocation/deallocation hooks in Spiderpool Agent +- IP GC mechanism in Spiderpool Controller + +--- + +## 7. Risks and Mitigations + +| Risk | Impact | Mitigation | +|------|--------|------------| +| IaaS provider API unavailable | High - blocks IP allocation | Implement circuit breaker, configurable timeout, option to disable integration | +| IaaS provider returns invalid data | Medium - incorrect network config | Validate response format, implement retry with backoff | +| Increased IP allocation latency | Medium - slower Pod startup | Configure appropriate timeouts, async processing if possible | +| Security concerns with external API | Medium - potential data exposure | Support mTLS, API authentication tokens via Kubernetes secrets | + +--- + +## 8. Open Questions + +- [RESOLVED: 1] Should the IaaS release API be called before or after Spiderpool releases the IP from its pool? + - **Answer**: Option B - Release Spiderpool IP first, then call IaaS API. This prioritizes speed; IaaS cleanup failures are logged but don't block Spiderpool release. +- [RESOLVED: 2] What authentication mechanism is required for the IaaS provider API? + - **Answer**: mTLS with client certificates. The certificate and key will be mounted from Kubernetes secrets via Helm configuration. +- [RESOLVED: 3] How should the `parentNicMac` be determined? + - **Answer**: Parse Pod's Multus annotation to identify SpiderMultusConfig instance. If CNI type is VLAN, get master NIC name from config and retrieve MAC via netlink. + +--- + +## 9. Implementation Notes + +### Parent NIC MAC Discovery + +The `parentNicMac` parameter in the IaaS allocate request is determined as follows: + +1. **Parse Multus Annotation**: Read the Pod's `k8s.v1.cni.cncf.io/networks` annotation to identify network attachments +2. **Lookup SpiderMultusConfig**: Match the network name to a SpiderMultusConfig CRD instance +3. **Check CNI Type**: If the SpiderMultusConfig uses VLAN CNI type: + - Extract `master` NIC name from the CNI configuration + - Use netlink to retrieve the MAC address of the master interface +4. **Reuse Existing Code**: Leverage existing Multus annotation parsing code from the IPAM allocation path + +This approach ensures consistency with Spiderpool's existing network configuration handling. + +--- + +## 10. Appendix + +### API Endpoint Details + +**Allocate API:** +- Method: POST +- Path: `/v1/iaas.network.io/ipam/allocate-ips` +- Content-Type: application/json + +**Release API:** +- Method: POST +- Path: `/v1/iaas.network.io/ipam/release-ip` +- Content-Type: application/json + +### Related Features + +- 002-workloadendpoint-query-api: Provides MAC retrieval capability +- SpiderEndpoint CRD with MAC field: Storage for IaaS-returned MAC addresses diff --git a/specs/003-iaas-provider-integration/tasks-phase2.md b/specs/003-iaas-provider-integration/tasks-phase2.md new file mode 100644 index 000000000..7f530d142 --- /dev/null +++ b/specs/003-iaas-provider-integration/tasks-phase2.md @@ -0,0 +1,342 @@ +# Tasks: IaaS Network Provider Integration - Phase 2 + +**Feature**: IaaS Network Provider Integration - Phase 2 (API Implementation) +**Branch**: `003-iaas-provider-integration` +**Created**: 2025-04-27 +**Depends On**: Phase 1 (Configuration and Secret Mounting) + +--- + +## Overview + +This task list implements Phase 2 of the IaaS Network Provider Integration: API Client and IPAM Hooks. This phase integrates the actual IaaS API calls into Spiderpool's IP allocation and release flows. + +**Total Tasks**: 12 +**Estimated Effort**: Medium (API client + IPAM integration) +**Parallel Tasks**: T101-T102, T104-T105, T107-T108 + +--- + +## Phase 1: IaaS API Client + +**Goal**: Create HTTP client with mTLS for IaaS provider communication +**User Story**: FR2 (IaaS API Client) +**Independent Test Criteria**: +- HTTP client can make mTLS requests with test certificates +- Unit tests pass for request/response serialization + +### Implementation Tasks + +- [x] **T101** Create IaaS API client package structure + - **File**: `pkg/iaas/client/client.go` (new package) + - **Description**: Define IaaSClient interface and basic structure + - **Code**: + ```go + type Client interface { + AllocateIPs(ctx context.Context, req *AllocateIPRequest) (*AllocateIPResponse, error) + ReleaseIPs(ctx context.Context, req *ReleaseIPRequest) error + } + + type IaaSClient struct { + baseURL string + httpClient *http.Client + certPath string + keyPath string + } + ``` + +- [x] **T102** [P] Implement mTLS HTTP client initialization + - **File**: `pkg/iaas/client/client.go` + - **Description**: Load certificates and create http.Client with TLS config + - **Code**: + ```go + func NewClient(cfg *types.IaaSProviderConfig) (*IaaSClient, error) { + // Load client certificate + // Create TLS config + // Initialize http.Client + } + ``` + +- [x] **T103** Implement request/response types + - **File**: `pkg/iaas/client/types.go` + - **Description**: Define Go structs matching API spec + - **Structs**: + ```go + type AllocateIPRequest struct { + PodName string `json:"podName,omitempty"` + PodNamespace string `json:"podNamespace,omitempty"` + PodUID string `json:"podUID,omitempty"` + NodeName string `json:"nodeName"` + IaaSIPsAllocationRequest []IaaSIPAllocationItem `json:"iaasIPsAllocationRequest"` + } + + type IaaSIPAllocationItem struct { + IPAddress string `json:"ipAddress"` + Subnet string `json:"subnet"` + ParentNicMac string `json:"parentNicMac"` + } + + type AllocateIPResponse struct { + PodName string `json:"podName"` + PodNamespace string `json:"podNamespace"` + NodeName string `json:"nodeName"` + IaaSIPsAllocationResponse []IaaSIPAllocationResult `json:"iaasIPsAllocationResponse"` + } + + type IaaSIPAllocationResult struct { + ParentNicMac string `json:"parentNicMac"` + Subnet string `json:"subnet"` + IPAddress string `json:"ipAddress"` + MacAddress string `json:"macAddress"` + VlanID int64 `json:"vlanId"` + } + + type ReleaseIPRequest struct { + PodName string `json:"podName,omitempty"` + PodNamespace string `json:"podNamespace,omitempty"` + PodUID string `json:"podUID,omitempty"` + NodeName string `json:"nodeName"` + IPAddresses []string `json:"ipAddresses"` + } + ``` + +- [x] **T104** [P] Implement AllocateIPs API call + - **File**: `pkg/iaas/client/allocate.go` + - **Description**: POST /v1/iaas.network.io/ipam/allocate-ips + - **Details**: + - POST to `/v1/iaas.network.io/ipam/allocate-ips` + - Marshal request body + - Handle response + - Return error on non-200 status + - Timeout: 30 seconds + +- [x] **T105** [P] Implement ReleaseIPs API call + - **File**: `pkg/iaas/client/release.go` + - **Description**: POST /v1/iaas.network.io/ipam/release-ip + - **Details**: + - POST to `/v1/iaas.network.io/ipam/release-ip` + - Marshal request body + - Return error on non-200 status + - Timeout: 30 seconds + +- [x] **T106** Add logging and observability + - **Files**: `pkg/iaas/client/*.go` + - **Description**: Add structured logging for all API calls + - **Requirements**: + - Log request with context (pod info) + - Log response or error + - Track latency metrics (optional for Phase 2) + +--- + +## Phase 2: IPAM Integration + +**Goal**: Integrate IaaS API calls into IPAM Allocate flow +**User Story**: FR4 (IP Allocation Flow) +**Independent Test Criteria**: +- IP allocation with IaaS configured calls the API +- MAC/VLAN returned from IaaS are stored in SpiderEndpoint + +### Implementation Tasks + +- [x] **T107** [P] Get parent NIC MAC from Multus annotation + - **File**: `pkg/iaas/utils/multus.go` + - **Description**: Parse Pod annotation to get parent NIC MAC + - **Function**: + ```go + func GetParentNicMac(ctx context.Context, pod *corev1.Pod, ifName string) (string, error) + ``` + - **Algorithm**: + 1. Read `k8s.v1.cni.cncf.io/networks` annotation + 2. Parse to get SpiderMultusConfig name + 3. Check if CNI type is vlan + 4. Get master interface name + 5. Use netlink to get MAC address + +- [x] **T108** [P] Integrate IaaS allocate into IPAM + - **File**: `pkg/ipam/allocate.go` (existing or new file) + - **Description**: After Spiderpool IP allocation, call IaaS API + - **Integration Point**: In `Allocate()` method, after IP allocation, before returning + - **Code Flow**: + ```go + // After Spiderpool allocates IPs + if iaasClient != nil { + // Build IaaS request + req := buildIaaSAllocateRequest(pod, results, parentNicMac) + + // Call IaaS API + resp, err := iaasClient.AllocateIPs(ctx, req) + if err != nil { + return nil, fmt.Errorf("iaas allocation failed: %w", err) + } + + // Merge IaaS response into allocation results + mergeIaaSResponse(results, resp) + } + ``` + +- [x] **T109** Store MAC/VLAN in SpiderEndpoint (already in Phase 1) + - **File**: `pkg/utils/convert/convert.go` (existing) or IPAM + - **Description**: Store returned MAC and VLAN in IPAllocationDetail + - **Already Done**: MAC field already in IPAllocationDetail from previous PR + - **Task**: Ensure IaaS response MAC/VLAN are passed to conversion function + +--- + +## Phase 3: IP Release Integration + +**Goal**: Integrate IaaS API calls into IPAM Release flow +**User Story**: FR5 (IP Release Flow) +**Independent Test Criteria**: +- IP release calls IaaS API +- Release proceeds even if IaaS call fails + +### Implementation Tasks + +- [x] **T110** Integrate IaaS release into IPAM Release + - **File**: `pkg/ipam/release.go` (existing) + - **Description**: Before or during Spiderpool release, call IaaS API + - **Requirements**: + - Call IaaS release API with IP addresses being released + - Log IaaS API result + - Continue release even if IaaS call fails (fail-open) + +--- + +## Phase 4: Controller GC Integration + +**Goal**: Integrate IaaS API calls into IP Garbage Collection +**User Story**: FR5.4 (GC Integration) +**Independent Test Criteria**: +- GC triggers IaaS release calls +- GC proceeds even if IaaS call fails + +### Implementation Tasks + +- [ ] **T111** Integrate IaaS release into Controller GC + - **File**: Controller GC code (find location) + - **Description**: When GC releases IPs, call IaaS API + - **Requirements**: + - Similar to IPAM Release integration + - Call IaaS before releasing IPs + - Continue GC even if IaaS call fails + +--- + +## Phase 5: Agent Initialization + +**Goal**: Wire IaaS client into Agent +**User Story**: Infrastructure + +### Implementation Tasks + +- [x] **T112** Initialize IaaS client in Agent + - **File**: `cmd/spiderpool-agent/cmd/daemon.go` + - **Description**: Create IaaS client if configured + - **Code**: + ```go + // After config validation + if agentContext.Cfg.IaaSProviderConfig.URL != "" { + iaasClient, err := iaasclient.NewClient(&agentContext.Cfg.IaaSProviderConfig) + if err != nil { + logger.Sugar().Warnf("Failed to create IaaS client: %v", err) + } else { + agentContext.IaaSClient = iaasClient + } + } + ``` + +--- + +## Dependency Graph + +``` +T101 (Package structure) + ├── T102 (mTLS client) + └── T103 (Types) + ├── T104 (Allocate API) + └── T105 (Release API) + +T104 + T105 → T106 (Logging) + +T103 + T106 → T107 (Multus MAC) + T108 (IPAM Allocate) +T108 → T109 (MAC/VLAN storage) + +T105 + T106 → T110 (IPAM Release) +T110 → T111 (GC Integration) + +T101-T106 + T112 → Agent initialization +``` + +--- + +## Parallel Execution + +### Wave 1: API Client (T101-T106) +- T101 → T102, T103 (并行) → T104, T105 (并行) → T106 + +### Wave 2: IPAM Integration (T107-T109) +- T103 → T107, T108 (并行) → T109 + +### Wave 3: Release & GC (T110-T111) +- T105 → T110 → T111 + +### Wave 4: Agent Wiring (T112) +- T106 + T108 + T110 → T112 + +--- + +## Testing Strategy + +### Unit Tests +- T102: mTLS client creation with test certificates +- T104: Allocate API request/response handling +- T105: Release API request handling + +### Integration Tests (Manual) +1. Deploy mock IaaS server +2. Configure Spiderpool to use mock server +3. Create Pod and verify: + - IaaS allocate is called + - MAC/VLAN are stored in SpiderEndpoint +4. Delete Pod and verify: + - IaaS release is called + +--- + +## Task Checklist + +### Phase 2 API Client +- [x] T101 Package structure +- [x] T102 mTLS client +- [x] T103 Request/response types +- [x] T104 Allocate API +- [x] T105 Release API +- [x] T106 Logging + +### Phase 2 IPAM Integration +- [x] T107 Multus MAC parsing +- [x] T108 IPAM Allocate integration +- [x] T109 MAC/VLAN storage + +### Phase 2 Release & GC +- [x] T110 IPAM Release integration +- [ ] T111 GC integration + +### Phase 2 Agent Wiring +- [x] T112 Agent initialization + +--- + +## Next Steps + +After Phase 2 tasks are complete: + +1. End-to-end testing +2. Documentation update +3. PR review and merge +4. Consider Phase 3 (metrics, advanced features) + +--- + +**Ready to implement**: Start with T101 (Package structure) → T102-T103 (并行) diff --git a/specs/003-iaas-provider-integration/tasks.md b/specs/003-iaas-provider-integration/tasks.md new file mode 100644 index 000000000..a5f83af42 --- /dev/null +++ b/specs/003-iaas-provider-integration/tasks.md @@ -0,0 +1,333 @@ +# Tasks: IaaS Network Provider Integration + +**Feature**: IaaS Network Provider Integration +**Branch**: `003-iaas-provider-integration` +**Generated**: 2025-04-27 +**Spec**: [spec.md](./spec.md) +**Plan**: [plan.md](./plan.md) + +--- + +## Overview + +This task list implements Phase 1 of the IaaS Network Provider Integration: Configuration Infrastructure and Secret Mounting. The actual API client and IPAM hooks will be implemented in Phase 2 (future work). + +**Total Tasks**: 10 +**Estimated Effort**: Small to Medium (Helm + Go configuration) +**Parallel Tasks**: T002-T004, T006-T007 + +--- + +## Phase 0: Project Setup + +**Goal**: Ensure all prerequisites are ready for implementation +**Test Criteria**: N/A (setup phase) + +- [ ] T000 Verify existing documentation is complete + - [ ] Review spec.md for clarity + - [ ] Review plan.md for task completeness + - [ ] Review data-model.md for accuracy + +--- + +## Phase 1: Helm Chart Configuration + +**Goal**: Add IaaS provider configuration support to Spiderpool Helm chart +**User Story**: FR1, FR3 (Configuration Infrastructure) +**Independent Test Criteria**: +- `helm template` renders without errors when `iaasNetworkProvider` is configured +- Generated manifests include secret volume mounts when URL is set +- Generated manifests exclude secret volume mounts when URL is empty + +### Implementation Tasks + +- [x] T001 Add `iaasNetworkProvider` values to Helm chart schema + - **File**: `charts/spiderpool/values.yaml` + - **Description**: Add `iaasNetworkProvider` configuration section with `url`, `tlsSecret.name`, `tlsSecret.namespace` + - **Acceptance**: + ```yaml + iaasNetworkProvider: + url: "" + tlsSecret: + name: "" + namespace: "" + ``` + +- [x] T002 [P] Add secret volume mount to Agent DaemonSet template + - **File**: `charts/spiderpool/templates/daemonset.yaml` + - **Description**: Add volume and volumeMount for TLS secret when `iaasNetworkProvider.url` is non-empty + - **Condition**: `{{- if .Values.iaasNetworkProvider.url }}` + - **Mount Path**: `/etc/spiderpool/iaas-tls/` + - **Files**: `tls.crt`, `tls.key` + +- [x] T003 [P] Add secret volume mount to Controller Deployment template + - **File**: `charts/spiderpool/templates/deployment.yaml` + - **Description**: Add volume and volumeMount for TLS secret (same as T002) + - **Condition**: `{{- if .Values.iaasNetworkProvider.url }}` + +- [x] T004 [P] Add IaaS configuration to ConfigMap template + - **File**: `charts/spiderpool/templates/configmap.yaml` (or create if not exists) + - **Description**: Add IaaS provider URL and TLS secret reference as environment variables + - **Variables**: + - `SPIDERPOOL_IAAS_PROVIDER_URL` + - `SPIDERPOOL_IAAS_TLS_SECRET_NAME` + - `SPIDERPOOL_IAAS_TLS_SECRET_NAMESPACE` + - `SPIDERPOOL_IAAS_TLS_CERT_PATH` (hardcoded: `/etc/spiderpool/iaas-tls/tls.crt`) + - `SPIDERPOOL_IAAS_TLS_KEY_PATH` (hardcoded: `/etc/spiderpool/iaas-tls/tls.key`) + +- [ ] T005 Add Helm template tests for IaaS configuration + - **Files**: Test scripts or CI pipeline + - **Description**: Add tests to verify Helm template rendering + - **Test Cases**: + 1. Template renders with empty `iaasNetworkProvider.url` (no volumes) + 2. Template renders with valid `iaasNetworkProvider` config (volumes present) + 3. Template fails or warns with URL set but missing TLS secret config + +--- + +## Phase 2: Go Configuration Types + +**Goal**: Define Go structs and loading logic for IaaS provider configuration +**User Story**: FR3 (Configuration Infrastructure) +**Independent Test Criteria**: +- Configuration structs compile without errors +- Unit tests pass for configuration loading + +### Implementation Tasks + +- [x] T006 [P] Add `IaaSProviderConfig` Go types to config package + - **File**: `pkg/config/config.go` + - **Description**: Define configuration structs + - **Code**: + ```go + type IaaSProviderConfig struct { + URL string `yaml:"url"` + TLSSecret TLSSecretConfig `yaml:"tlsSecret"` + } + + type TLSSecretConfig struct { + Name string `yaml:"name"` + Namespace string `yaml:"namespace"` + } + ``` + +- [x] T007 [P] Add configuration loading to Agent + - **File**: `cmd/spiderpool-agent/cmd/config.go` + - **Description**: Load IaaS provider config from environment variables into global config + - **Variables**: + - `SPIDERPOOL_IAAS_PROVIDER_URL` + - `SPIDERPOOL_IAAS_TLS_SECRET_NAME` + - `SPIDERPOOL_IAAS_TLS_SECRET_NAMESPACE` + +- [x] T008 [P] Add configuration loading to Controller + - **File**: `cmd/spiderpool-controller/cmd/config.go` + - **Description**: Load IaaS provider config (same as T007) + +--- + +## Phase 3: Validation and Integration + +**Goal**: Validate configuration at startup and ensure proper integration +**User Story**: FR3.8-FR3.9 (Startup Validation) +**Independent Test Criteria**: +- Agent starts successfully with valid IaaS config +- Agent logs appropriate messages about IaaS configuration +- Controller starts successfully with valid IaaS config + +### Implementation Tasks + +- [x] T009 Add Agent startup validation for IaaS configuration + - **File**: `cmd/spiderpool-agent/cmd/daemon.go` (or initialization code) + - **Description**: Validate secret existence when URL is configured + - **Validation**: + 1. If URL is empty: skip validation, log "IaaS integration disabled" + 2. If URL is set: validate TLS secret name and namespace are not empty + 3. Optional: Check if secret files exist at mount path + - **Error Handling**: Log warning if validation fails, but don't block startup (fail-open for Phase 1) + +- [x] T010 Add Controller startup validation for IaaS configuration + - **File**: `cmd/spiderpool-controller/cmd/daemon.go` + - **Description**: Same validation as T009 for Controller + +--- + +## Phase 4: Documentation and Examples + +**Goal**: Update documentation with IaaS configuration examples +**User Story**: Documentation and usability +**Independent Test Criteria**: Documentation is accurate and complete + +- [x] T011 Update Helm values documentation + - **File**: `charts/spiderpool/README.md` (or relevant doc) + - **Description**: Add `iaasNetworkProvider` configuration example + - **Include**: + - Configuration syntax + - Secret creation example + - Troubleshooting tips + +- [x] T012 Update quickstart.md with verification steps + - **File**: `specs/003-iaas-provider-integration/quickstart.md` + - **Description**: Add actual verification commands for the implementation + - **Commands**: + - Check environment variables + - Verify secret mount + - Validate certificate files + +--- + +## Dependency Graph + +``` +T001 (Helm values) + ├── T002 (Agent volume mount) + ├── T003 (Controller volume mount) + └── T004 (ConfigMap) + +T001 ──> T005 (Helm tests) +T002 ──> T005 +T003 ──> T005 +T004 ──> T005 + +T006 (Go types) + ├── T007 (Agent config loading) + └── T008 (Controller config loading) + +T007 ──> T009 (Agent validation) +T008 ──> T010 (Controller validation) + +T005, T009, T010 ──> T011, T012 (Docs) +``` + +--- + +## Parallel Execution + +### Wave 1: Helm Chart (T001-T005) +- T001, T002, T003, T004 can be done in parallel after T001 +- T005 depends on T002, T003, T004 + +### Wave 2: Go Implementation (T006-T010) +- T006 can be done in parallel with Wave 1 +- T007, T008 can be done in parallel after T006 +- T009 depends on T007 +- T010 depends on T008 + +### Wave 3: Documentation (T011-T012) +- T011, T012 can be done in parallel after Wave 1 and Wave 2 complete + +--- + +## Suggested MVP Scope + +**MVP = T001-T010** (Full Phase 1) + +Phase 1 is already minimal - it only includes configuration infrastructure. All 10 tasks are required for a complete, testable feature. + +**Optional for MVP**: +- T011, T012 (Documentation) - Can be done immediately after or in parallel + +--- + +## Test Strategy + +### Manual Testing + +1. **Helm Template Test**: + ```bash + helm template spiderpool charts/spiderpool \ + --set iaasNetworkProvider.url="test:444" \ + --set iaasNetworkProvider.tlsSecret.name="test-secret" \ + --set iaasNetworkProvider.tlsSecret.namespace="spiderpool" \ + | grep -A5 "iaas-tls" + ``` + +2. **Secret Mount Test**: + ```bash + # Create test secret + kubectl create secret tls test-secret \ + --cert=/path/to/test.crt \ + --key=/path/to/test.key \ + -n spiderpool + + # Install/upgrade with IaaS config + helm upgrade spiderpool charts/spiderpool \ + --set iaasNetworkProvider.url="test:444" \ + --set iaasNetworkProvider.tlsSecret.name="test-secret" + + # Verify mount + kubectl exec -n spiderpool ds/spiderpool-agent -- ls /etc/spiderpool/iaas-tls/ + ``` + +### Automated Tests + +- T005: Helm template rendering tests +- Go unit tests for configuration loading (T007, T008) +- Integration tests for validation (T009, T010) - optional for Phase 1 + +--- + +## Implementation Notes + +### Important: Phase 1 Scope + +This task list **ONLY** covers configuration infrastructure (Phase 1). The following are **NOT included** and will be Phase 2: + +- HTTP client for IaaS API communication +- IPAM hooks to call IaaS allocate/release APIs +- MAC address storage beyond configuration +- IP Garbage Collection integration + +### Backward Compatibility + +All Phase 1 tasks maintain backward compatibility: +- Empty `iaasNetworkProvider.url` disables all IaaS features +- No changes to existing IPAM logic +- No breaking changes to Helm values + +### Security Considerations + +- TLS certificates are mounted read-only +- Private keys (`tls.key`) are never logged +- Certificate paths are hardcoded to prevent traversal attacks + +--- + +## Task Checklist + +### Pre-Implementation +- [ ] Review spec.md and plan.md +- [ ] Ensure Kubernetes test environment is available +- [ ] Create feature branch: `003-iaas-provider-integration` + +### Implementation +- [ ] T001 Helm values +- [ ] T002 Agent DaemonSet +- [ ] T003 Controller Deployment +- [ ] T004 ConfigMap +- [ ] T005 Helm tests +- [ ] T006 Go types +- [ ] T007 Agent config loading +- [ ] T008 Controller config loading +- [ ] T009 Agent validation +- [ ] T010 Controller validation + +### Post-Implementation +- [ ] T011 Documentation +- [ ] T012 Quickstart update +- [ ] Manual testing on test cluster +- [ ] PR review and merge + +--- + +## Next Steps + +After Phase 1 tasks are complete: + +1. Merge Phase 1 changes +2. Test in staging environment +3. Plan Phase 2: IaaS API client and IPAM hooks +4. Create Phase 2 specification and tasks + +--- + +**Ready to implement**: Start with T001 (Helm values) → T002-T004 (parallel) → T005 (tests) From e3e26bd02ded016837904cfbe1eb221a23fb93bd Mon Sep 17 00:00:00 2001 From: Cyclinder Kuo Date: Fri, 8 May 2026 18:10:57 +0800 Subject: [PATCH 2/3] Add vlan-cni plugin support to spiderpool-plugins image * add VLAN_VERSION build argument and environment variable * clone vlan-cni repository and build vlan binary * copy vlan binary to release image at /usr/plugins/vlan * add --install-vlan flag to entrypoint.sh for conditional installation * add vlan plugin installation logic with version logging * export VLAN_VERSION in GitHub Actions workflow outputs * set default VLAN_VERSION to 0.0.1 in version.sh Signed-off-by: Cyclinder Kuo --- .github/workflows/build-image-ci.yaml | 86 +++++++++++++++++++++- .github/workflows/build-image-plugins.yaml | 2 + charts/spiderpool/README.md | 7 +- charts/spiderpool/templates/daemonset.yaml | 4 +- charts/spiderpool/values.yaml | 5 +- images/spiderpool-plugins/Dockerfile | 11 ++- images/spiderpool-plugins/entrypoint.sh | 14 ++++ images/spiderpool-plugins/version.sh | 2 + 8 files changed, 123 insertions(+), 8 deletions(-) diff --git a/.github/workflows/build-image-ci.yaml b/.github/workflows/build-image-ci.yaml index 54f4b3c94..245505df9 100644 --- a/.github/workflows/build-image-ci.yaml +++ b/.github/workflows/build-image-ci.yaml @@ -7,6 +7,23 @@ env: ONLINE_REGISTER_PASSWORD: ${{ secrets.GITHUB_TOKEN }} on: + workflow_dispatch: + inputs: + ref: + description: 'Git ref to build (branch, tag, or SHA)' + required: false + default: '' + type: string + push: + description: 'Push images to registry' + required: false + default: false + type: boolean + upload_artifacts: + description: 'Upload images as artifacts for download' + required: true + default: true + type: boolean workflow_run: workflows: - "Image CI Cache Cleaner" @@ -63,7 +80,17 @@ jobs: id: tag run: | echo ${{ github.event_name }} - if ${{ inputs.ref != '' }}; then + if ${{ github.event_name == 'workflow_dispatch' }}; then + echo "trigger by workflow_dispatch" + # Use provided ref or default to current commit SHA + REF="${{ github.event.inputs.ref }}" + if [ -z "$REF" ]; then + REF="${{ github.sha }}" + fi + echo "tag=$REF" >> $GITHUB_ENV + echo "push=${{ github.event.inputs.push }}" >> $GITHUB_ENV + echo "upload_artifacts=${{ github.event.inputs.upload_artifacts }}" >> $GITHUB_ENV + elif ${{ inputs.ref != '' }}; then echo "trigger by workflow_call" echo "tag=${{ inputs.ref }}" >> $GITHUB_ENV echo "push=${{ inputs.push }}" >> $GITHUB_ENV @@ -75,10 +102,11 @@ jobs: echo "trigger by pull_request_target" echo "tag=${{ github.event.pull_request.head.sha }}" >> $GITHUB_ENV echo "push=false" >> $GITHUB_ENV - elif ${{ github.event_name == 'workflow_run' }} ; then + elif ${{ github.event_name == 'workflow_run' }}; then echo "trigger by workflow_run" echo "tag=main" >> $GITHUB_ENV echo "push=false" >> $GITHUB_ENV + echo "upload_artifacts=false" >> $GITHUB_ENV else echo "trigger by ${{ github.event_name }}" echo "tag=${{ github.sha }}" >> $GITHUB_ENV @@ -248,6 +276,42 @@ jobs: echo "${{ env.ONLINE_REGISTER }}/${{ github.repository }}/${{ matrix.name }}-ci:${{ env.tag }}@${{ steps.docker_build_ci_pr.outputs.digest }}" > image-digest/${{ matrix.name }}.txt echo "${{ env.ONLINE_REGISTER }}/${{ github.repository }}/${{ matrix.name }}-ci:${{ env.tag }}-race@${{ steps.docker_build_ci_pr_detect_race_condition.outputs.digest }}" >> image-digest/${{ matrix.name }}.txt + # Export amd64 image tar for workflow_dispatch + - name: Export ${{ matrix.name }} amd64 image + if: ${{ github.event_name == 'workflow_dispatch' && env.upload_artifacts == 'true' }} + uses: docker/build-push-action@v6.15.0 + with: + context: . + file: ${{ matrix.dockerfile }} + push: false + platforms: linux/amd64 + outputs: type=docker,dest=/tmp/${{ matrix.name }}-amd64.tar + github-token: ${{ secrets.WELAN_PAT }} + tags: | + ${{ env.ONLINE_REGISTER }}/${{ github.repository }}/${{ matrix.name }}-ci:${{ env.tag }}-amd64 + build-args: | + GIT_COMMIT_VERSION=${{ env.commitver }} + GIT_COMMIT_TIME=${{ env.committime }} + VERSION=${{ env.tag }} + + # Export arm64 image tar for workflow_dispatch + - name: Export ${{ matrix.name }} arm64 image + if: ${{ github.event_name == 'workflow_dispatch' && env.upload_artifacts == 'true' }} + uses: docker/build-push-action@v6.15.0 + with: + context: . + file: ${{ matrix.dockerfile }} + push: false + platforms: linux/arm64 + outputs: type=docker,dest=/tmp/${{ matrix.name }}-arm64.tar + github-token: ${{ secrets.WELAN_PAT }} + tags: | + ${{ env.ONLINE_REGISTER }}/${{ github.repository }}/${{ matrix.name }}-ci:${{ env.tag }}-arm64 + build-args: | + GIT_COMMIT_VERSION=${{ env.commitver }} + GIT_COMMIT_TIME=${{ env.committime }} + VERSION=${{ env.tag }} + # Upload artifact digests - name: Upload artifact digests uses: actions/upload-artifact@v6.0.0 @@ -256,6 +320,24 @@ jobs: path: image-digest retention-days: 1 + # Upload artifact amd64 images tar + - name: Upload artifact ${{ matrix.name }} amd64 tar + if: ${{ github.event_name == 'workflow_dispatch' && env.upload_artifacts == 'true' }} + uses: actions/upload-artifact@v6.0.0 + with: + name: ${{ matrix.name }}-amd64-${{ env.tag }} + path: /tmp/${{ matrix.name }}-amd64.tar + retention-days: 7 + + # Upload artifact arm64 images tar + - name: Upload artifact ${{ matrix.name }} arm64 tar + if: ${{ github.event_name == 'workflow_dispatch' && env.upload_artifacts == 'true' }} + uses: actions/upload-artifact@v6.0.0 + with: + name: ${{ matrix.name }}-arm64-${{ env.tag }} + path: /tmp/${{ matrix.name }}-arm64.tar + retention-days: 7 + # Upload artifact race images tar - name: Upload artifact race image tar uses: actions/upload-artifact@v6.0.0 diff --git a/.github/workflows/build-image-plugins.yaml b/.github/workflows/build-image-plugins.yaml index e9464f826..2f2cd501c 100644 --- a/.github/workflows/build-image-plugins.yaml +++ b/.github/workflows/build-image-plugins.yaml @@ -88,6 +88,7 @@ jobs: echo "SRIOV_VERSION=${SRIOV_VERSION}" >> $GITHUB_OUTPUT echo "IB_SRIOV_VERSION=${IB_SRIOV_VERSION}" >> $GITHUB_OUTPUT echo "IPOIB_VERSION=${IPOIB_VERSION}" >> $GITHUB_OUTPUT + echo "VLAN_VERSION=${VLAN_VERSION}" >> $GITHUB_OUTPUT - name: Login to online register uses: docker/login-action@v3.4.0 @@ -118,6 +119,7 @@ jobs: SRIOV_VERSION=${{ steps.arg.outputs.SRIOV_VERSION }} IB_SRIOV_VERSION=${{ steps.arg.outputs.IB_SRIOV_VERSION }} IPOIB_VERSION=${{ steps.arg.outputs.IPOIB_VERSION }} + VLAN_VERSION=${{ steps.arg.outputs.VLAN_VERSION }} - name: Image Release Digest if: ${{ env == 'false' }} diff --git a/charts/spiderpool/README.md b/charts/spiderpool/README.md index ea624302a..5bc040c87 100644 --- a/charts/spiderpool/README.md +++ b/charts/spiderpool/README.md @@ -224,6 +224,7 @@ helm install spiderpool spiderpool/spiderpool --wait --namespace kube-system \ | `plugins.installSriovCNI` | install sriov cni to each node | `true` | | `plugins.installibSriovCNI` | install ib-sriov cni to each node | `true` | | `plugins.installIpoibCNI` | install ipoib cni to each node | `true` | +| `plugins.installVlanCNI` | install vlan cni to each node | `true` | | `plugins.image.registry` | the image registry of plugins | `ghcr.io` | | `plugins.image.repository` | the image repository of plugins | `spidernet-io/spiderpool/spiderpool-plugins` | | `plugins.image.pullPolicy` | the image pullPolicy of plugins | `IfNotPresent` | @@ -462,6 +463,6 @@ helm install spiderpool spiderpool/spiderpool --wait --namespace kube-system \ ### IaaS Network Provider Integration -| Name | Description | Value | -| ------------------------------- | ----------------------------------------------------------------------------------------- | ----- | -| `iaasNetworkProvider.serverUrl` | the URL of the IaaS provider service (host:port). If empty, IaaS integration is disabled. | `""` | +| Name | Description | Value | +| ------------------------------- | --------------------------------------------------------------------------------------------------------------------------- | ----- | +| `iaasNetworkProvider.serverUrl` | the URL of the IaaS provider service. Must include scheme (http or https) and port. If empty, IaaS integration is disabled. | `""` | diff --git a/charts/spiderpool/templates/daemonset.yaml b/charts/spiderpool/templates/daemonset.yaml index 063b247c3..18c3308ac 100644 --- a/charts/spiderpool/templates/daemonset.yaml +++ b/charts/spiderpool/templates/daemonset.yaml @@ -80,7 +80,7 @@ spec: - linux {{- end }} initContainers: - {{- if or .Values.plugins.installCNI .Values.plugins.installRdmaCNI .Values.plugins.installOvsCNI .Values.plugins.installibSriovCNI .Values.plugins.installIpoibCNI }} + {{- if or .Values.plugins.installCNI .Values.plugins.installRdmaCNI .Values.plugins.installOvsCNI .Values.plugins.installibSriovCNI .Values.plugins.installIpoibCNI .Values.plugins.installVlanCNI }} - name: install-plugins image: {{ include "plugins.image" . | quote }} imagePullPolicy: {{ .Values.plugins.image.pullPolicy }} @@ -97,6 +97,8 @@ spec: value: {{ .Values.plugins.installibSriovCNI | quote }} - name: INSTALL_IPOIB_PLUGIN value: {{ .Values.plugins.installIpoibCNI | quote }} + - name: INSTALL_VLAN_PLUGIN + value: {{ .Values.plugins.installVlanCNI | quote }} command: - "/bin/sh" - "entrypoint.sh" diff --git a/charts/spiderpool/values.yaml b/charts/spiderpool/values.yaml index 11f613ed3..dcf722725 100644 --- a/charts/spiderpool/values.yaml +++ b/charts/spiderpool/values.yaml @@ -307,6 +307,9 @@ plugins: ## @param plugins.installIpoibCNI install ipoib cni to each node installIpoibCNI: true + ## @param plugins.installVlanCNI install vlan cni to each node + installVlanCNI: true + image: ## @param plugins.image.registry the image registry of plugins registry: ghcr.io @@ -1067,5 +1070,5 @@ sriov: ## @section IaaS Network Provider Integration ## iaasNetworkProvider: - ## @param iaasNetworkProvider.serverUrl the URL of the IaaS provider service (e.g. http://host:port or https://host:port). Must include scheme. If empty, IaaS integration is disabled. + ## @param iaasNetworkProvider.serverUrl the URL of the IaaS provider service. Must include scheme (http or https) and port. If empty, IaaS integration is disabled. serverUrl: "" diff --git a/images/spiderpool-plugins/Dockerfile b/images/spiderpool-plugins/Dockerfile index 6605b6791..c3063849b 100644 --- a/images/spiderpool-plugins/Dockerfile +++ b/images/spiderpool-plugins/Dockerfile @@ -20,6 +20,8 @@ ARG SRIOV_VERSION ENV SRIOV_VERSION=${SRIOV_VERSION} ARG IPOIB_VERSION ENV IPOIB_VERSION=${IPOIB_VERSION} +ARG VLAN_VERSION +ENV VLAN_VERSION=${VLAN_VERSION} WORKDIR /src @@ -29,7 +31,8 @@ RUN touch VERSION.sh && \ printf "export IB_SRIOV_VERSION=%s\n" "${IB_SRIOV_VERSION}" >> VERSION.sh && \ printf "export SRIOV_VERSION=%s\n" "${SRIOV_VERSION}" >> VERSION.sh && \ printf "export IPOIB_VERSION=%s\n" "${IPOIB_VERSION}" >> VERSION.sh && \ - printf "export RDMA_VERSION=%s\n" "${RDMA_VERSION}" >> VERSION.sh + printf "export RDMA_VERSION=%s\n" "${RDMA_VERSION}" >> VERSION.sh && \ + printf "export VLAN_VERSION=%s\n" "${VLAN_VERSION}" >> VERSION.sh RUN mkdir -p /src/cni/bin && \ curl -L -O https://github.com/containernetworking/plugins/releases/download/${CNI_VERSION}/cni-plugins-linux-${TARGETARCH}-${CNI_VERSION}.tgz && \ @@ -41,6 +44,7 @@ RUN git clone -b ${OVS_VERSION} --depth 1 https://github.com/k8snetworkplumbingw RUN git clone https://github.com/k8snetworkplumbingwg/ib-sriov-cni.git RUN git clone https://github.com/k8snetworkplumbingwg/sriov-cni.git RUN git clone https://github.com/Mellanox/ipoib-cni.git +RUN git clone https://github.com/spidernet-io/vlan-cni.git WORKDIR /src/rdma-cni RUN git checkout ${RDMA_VERSION} && make TARGET_ARCH=${TARGETARCH} \ @@ -58,6 +62,9 @@ RUN git checkout ${SRIOV_VERSION} && go mod vendor && mkdir -p build && GOOS=${T WORKDIR /src/ipoib-cni RUN git checkout ${IPOIB_VERSION} && make build && [ -f "build/ipoib" ] +WORKDIR /src/vlan-cni +RUN git checkout ${VLAN_VERSION} && mkdir -p bin && GOOS=${TARGETOS} GOARCH=${TARGETARCH} CGO_ENABLED=0 go build -v -ldflags="-s -w" -o bin/vlan ./cmd/vlan + #===== release image ========= FROM alpine:3 @@ -79,6 +86,7 @@ ENV SRIOV_BIN_PATH="/usr/plugins/sriov" ENV IB_SRIOV_BIN_PATH="/usr/plugins/ib-sriov" ENV CNI_BIN_DIR="/usr/plugins/cni" ENV IPOIB_BIN_PATH="/usr/plugins/ipoib" +ENV VLAN_BIN_PATH="/usr/plugins/vlan" ENV VERSION_FILE_PATH="/usr/plugins/VERSION.sh" WORKDIR / @@ -87,6 +95,7 @@ COPY --from=builder /src/ovs-cni/build/ovs ${OVS_BIN_PATH} COPY --from=builder /src/sriov-cni/build/sriov ${SRIOV_BIN_PATH} COPY --from=builder /src/ib-sriov-cni/build/ib-sriov ${IB_SRIOV_BIN_PATH} COPY --from=builder /src/ipoib-cni/build/ipoib ${IPOIB_BIN_PATH} +COPY --from=builder /src/vlan-cni/bin/vlan ${VLAN_BIN_PATH} COPY --from=builder /src/cni/bin/ ${CNI_BIN_DIR} COPY --from=builder /src/VERSION.sh ${VERSION_FILE_PATH} diff --git a/images/spiderpool-plugins/entrypoint.sh b/images/spiderpool-plugins/entrypoint.sh index 9f592fc09..9ed486a90 100755 --- a/images/spiderpool-plugins/entrypoint.sh +++ b/images/spiderpool-plugins/entrypoint.sh @@ -15,6 +15,7 @@ function usage() echo -e "--install-ib-sriov enable install ib-sriov" echo -e "--install-ipoib enable install ipoib" echo -e "--install-rdma enable install rdma-plugin" + echo -e "--install-vlan enable install vlan-plugin" echo -e "--copy-dst-dir specifies the path to these plugins installed" } @@ -26,6 +27,7 @@ echo "IB_SRIOV_BIN_PATH=${IB_SRIOV_BIN_PATH}" echo "CNI_BIN_DIR=${CNI_BIN_DIR}" echo "IPOIB_BIN_PATH=${IPOIB_BIN_PATH}" echo "VERSION_FILE_PATH=${VERSION_FILE_PATH}" +echo "VLAN_BIN_PATH=${VLAN_BIN_PATH}" echo "RDMA_BIN_PATH=${RDMA_BIN_PATH}" [ -f "${RDMA_BIN_PATH}" ] || { echo "error, failed to find ${RDMA_BIN_PATH}" ; exit 1 ; } @@ -33,6 +35,7 @@ echo "RDMA_BIN_PATH=${RDMA_BIN_PATH}" [ -f "${SRIOV_BIN_PATH}" ] || { echo "error, failed to find ${SRIOV_BIN_PATH}" ; exit 1 ; } [ -f "${IB_SRIOV_BIN_PATH}" ] || { echo "error, failed to find ${IB_SRIOV_BIN_PATH}" ; exit 1 ; } [ -f "${IPOIB_BIN_PATH}" ] || { echo "error, failed to find ${IPOIB_BIN_PATH}" ; exit 1 ; } +[ -f "${VLAN_BIN_PATH}" ] || { echo "error, failed to find ${VLAN_BIN_PATH}" ; exit 1 ; } [ -f "${VERSION_FILE_PATH}" ] || { echo "error, failed to find ${VERSION_FILE_PATH}" ; exit 1 ; } [ -d "${CNI_BIN_DIR}" ] || { echo "error, failed to find ${CNI_BIN_DIR}" ; exit 1 ; } @@ -43,6 +46,7 @@ INSTALL_RDMA_PLUGIN=${INSTALL_RDMA_PLUGIN:-false} INSTALL_SRIOV_PLUGIN=${INSTALL_SRIOV_PLUGIN:-false} INSTALL_IB_SRIOV_PLUGIN=${INSTALL_IB_SRIOV_PLUGIN:-false} INSTALL_IPOIB_PLUGIN=${INSTALL_IPOIB_PLUGIN:-false} +INSTALL_VLAN_PLUGIN=${INSTALL_VLAN_PLUGIN:-false} INSTALL_CNI_PLUGINS=${INSTALL_CNI_PLUGINS:-false} mkdir -p ${COPY_DST_DIR} || true @@ -110,4 +114,14 @@ else echo "skip installing ipoib" fi +if [ "$INSTALL_VLAN_PLUGIN" = "true" ]; then + echo "Installing vlan: ${VLAN_VERSION}" + rm -f ${COPY_DST_DIR}/vlan.old || true + ( [ -f "${COPY_DST_DIR}/vlan" ] && mv ${COPY_DST_DIR}/vlan ${COPY_DST_DIR}/vlan.old ) || true + cp ${VLAN_BIN_PATH} ${COPY_DST_DIR} + rm -f ${COPY_DST_DIR}/vlan.old &>/dev/null || true +else + echo "skip installing vlan" +fi + echo Done. diff --git a/images/spiderpool-plugins/version.sh b/images/spiderpool-plugins/version.sh index 9bef61b3a..15d963096 100644 --- a/images/spiderpool-plugins/version.sh +++ b/images/spiderpool-plugins/version.sh @@ -17,3 +17,5 @@ export SRIOV_VERSION=${SRIOV_VERSION:-"v2.10.0"} export IB_SRIOV_VERSION=${IB_SRIOV_VERSION:-"v1.3.0"} # https://github.com/Mellanox/ipoib-cni export IPOIB_VERSION=${IPOIB_VERSION:-"v1.2.2"} +# https://github.com/spidernet-io/vlan-cni +export VLAN_VERSION=${VLAN_VERSION:-"0.0.1"} From 32c5e524121360c9058fbc8c643316885a0a43e0 Mon Sep 17 00:00:00 2001 From: Cyclinder Kuo Date: Thu, 14 May 2026 15:37:23 +0800 Subject: [PATCH 3/3] fix: move link setup from CmdAdd to DetectIPConflictAndGatewayReachable * move netlink link setup logic from command_add.go to ipam_detection.go * set link up inside DetectIPConflictAndGatewayReachable before detection * change early return to continue when IP version is nil during detection * remove netlink import from command_add.go Signed-off-by: Cyclinder Kuo --- AGENTS.md | 33 +++ CODEOWNERS | 1 + README-zh_CN.md | 2 +- README.md | 2 +- charts/spiderpool/templates/configmap.yaml | 2 +- cmd/spiderpool/cmd/command_add.go | 20 -- cmd/spiderpool/cmd/command_delete.go | 2 +- docs/mkdocs.yml | 1 + docs/usage/iaas-network-provider-zh_CN.md | 274 +++++++++++++++++++ docs/usage/iaas-network-provider.md | 278 ++++++++++++++++++++ docs/usage/readme-zh_CN.md | 2 + docs/usage/readme.md | 2 + images/spiderpool-plugins/version.sh | 2 +- pkg/gcmanager/scanAll_IPPool.go | 5 +- pkg/gcmanager/tracePod_worker.go | 39 +-- pkg/iaas/client/client.go | 88 ++----- pkg/iaas/client/types.go | 17 +- pkg/ipam/allocate.go | 3 +- pkg/ipam/iaas.go | 189 +++++++------ pkg/ipam/ipam.go | 6 - pkg/ipam/release.go | 8 +- pkg/networking/networking/ipam_detection.go | 24 +- test/scripts/install-kdoctor.sh | 3 +- 23 files changed, 772 insertions(+), 231 deletions(-) create mode 100644 AGENTS.md create mode 100644 docs/usage/iaas-network-provider-zh_CN.md create mode 100644 docs/usage/iaas-network-provider.md diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 000000000..8f338aa89 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,33 @@ +# Repository Guidelines + +## Project Structure & Module Organization + +Spiderpool is a Go Kubernetes networking project. Main binaries live in `cmd/`, reusable packages in `pkg/`, and Kubernetes APIs, generated clients, and OpenAPI specs in `api/`. Helm packaging is under `charts/spiderpool/`; container build assets are in `images/`. End-to-end assets and cluster scripts live in `test/`, documentation in `docs/`, design/spec work in `specs/`, and shared automation in `tools/` and `contrib/`. Avoid editing `vendor/` directly unless dependency vendoring is the explicit task. + +## Build, Test, and Development Commands + +- `make build-bin`: build Spiderpool binaries into the local output path. +- `make install-bin`: install built binaries. +- `make build_image`: build Docker images with buildx using the current commit version. +- `make build_docker_image`: local Docker fallback when buildx has pull issues. +- `make dev-doctor`: verify Go and required e2e tools such as Docker, kubectl, kind, and p2ctl. +- `make gofmt`: run `go fmt` on Go packages. +- `make lint-golang`: run format checks, lock checks, `go vet`, and `golangci-lint`. +- `make manifests generate-k8s-api`: regenerate CRDs/RBAC/webhooks and deepcopy code. +- `make openapi-code-gen`: regenerate OpenAPI clients from `api/v1/*/openapi.yaml`. + +## Coding Style & Naming Conventions + +Use Go 1.25 as declared in `go.mod`. Keep Go code `gofmt`/`gofumpt` clean and satisfy `.golangci.yaml` linters: `govet`, `errcheck`, `staticcheck`, `ineffassign`, and `errorlint`. Package names are lowercase and directory-oriented, for example `pkg/ippoolmanager` and `pkg/workloadendpointmanager`. Tests use `_test.go`; suite files follow `*_suite_test.go`. + +## Testing Guidelines + +Unit tests use Ginkgo v2 and Gomega. Run `make unittest-tests` for package and command tests; it also checks that non-suite test files include a Ginkgo `Label(...)`. For e2e work, build or pull images first, then use targets such as `make e2e_init_spiderpool` and `make e2e_test_spiderpool`. Narrow e2e runs with `E2E_GINKGO_LABELS=smoke` or `GINKGO_OPTION="--label-filter=CaseLabel"`. + +## Commit & Pull Request Guidelines + +History uses short imperative subjects with optional scopes, such as `fix: ...`, `test: ...`, `CI: ...`, `charts: ...`, and release bumps. Keep commits focused and sign them when following the contribution docs (`git commit -s`). PRs should link issues with `Fixes #...`, state unit or e2e coverage, mention docs impact, include reviewer notes when needed, and fill the release-note block with either content or `NONE`. Apply one release label: `release/none`, `release/bug`, or `release/feature`. + +## Agent-Specific Instructions + +Before changing generated Kubernetes or OpenAPI files, update the source definitions and run the matching generation or verify target. Do not revert unrelated local changes; this repository may contain concurrent contributor work. diff --git a/CODEOWNERS b/CODEOWNERS index 885c0a48a..0d59f2b8f 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -31,3 +31,4 @@ /.openhands @weizhoublue /specs/ @weizhoublue @cyclinder /.dockerignore @weizhoublue @cyclinder +/AGENTS.md @weizhoublue @cyclinder diff --git a/README-zh_CN.md b/README-zh_CN.md index c2dbda0af..471e83bb4 100644 --- a/README-zh_CN.md +++ b/README-zh_CN.md @@ -61,7 +61,7 @@ underlay CNI 主要指 macvlan、ipvlan、SR-IOV 等能够直接访问宿主机 * 基于 CRD 的双栈 IPAM 能力 - 提供了独享、共享的 IP 地址池,支持设置各种亲和性,为中间件等有状态应用和 kubevirt 等固定 IP 地址值,为无状态应用固定 IP 地址范围,自动化管理独享的 IP 池,优秀的 IP 回收避免 IP 泄露等。并且,具备优秀的 [IPAM 分配性能](./docs/concepts/ipam-performance-zh_CN.md)。 + 提供了独享、共享的 IP 地址池,支持设置各种亲和性,为中间件等有状态应用和 kubevirt 等固定 IP 地址值,支持云上 IaaS 网络提供商绑定/释放云侧 IP 资源,为无状态应用固定 IP 地址范围,自动化管理独享的 IP 池,优秀的 IP 回收避免 IP 泄露等。并且,具备优秀的 [IPAM 分配性能](./docs/concepts/ipam-performance-zh_CN.md)。 Spiderpool IPAM 组件能够为任何支持第三方 IPAM 的 main CNI 使用,不仅包含了 [Macvlan CNI](https://github.com/containernetworking/plugins/tree/main/plugins/main/macvlan)、[ipvlan CNI](https://github.com/containernetworking/plugins/tree/main/plugins/main/ipvlan) 和 [SR-IOV CNI](https://github.com/k8snetworkplumbingwg/sriov-cni), 也包括了 [calico](https://github.com/projectcalico/calico) 和 [weave](https://github.com/weaveworks/weave) 作为静态 IP 场景使用。 diff --git a/README.md b/README.md index e1fe779e4..c10035cf0 100644 --- a/README.md +++ b/README.md @@ -63,7 +63,7 @@ The underlay CNI is mainly including macvlan, ipvlan, and SR-IOV, which cloud ac * CRD-based dual-stack IPAM - Spiderpool provides exclusive and shared IP address pools, supporting various affinity settings. It supports to assign static IP addresses for stateful applications such as [mysql](https://www.mysql.com), [redis](https://github.com/redis/redis), [kubevirt](https://github.com/kubevirt/kubevirt), while enabling fixed IP address ranges for stateless ones. Spiderpool automates the management of exclusive IP pools, ensuring excellent IP reclamation to avoid IP leakage. In additions, it provides [wonderful IPAM performance](./docs/concepts/ipam-performance.md). + Spiderpool provides exclusive and shared IP address pools, supporting various affinity settings. It supports to assign static IP addresses for stateful applications such as [mysql](https://www.mysql.com), [redis](https://github.com/redis/redis), [kubevirt](https://github.com/kubevirt/kubevirt), while enabling fixed IP address ranges for stateless ones. It also supports binding/releasing cloud-side IP resources with a generic IaaS Network Provider. Spiderpool automates the management of exclusive IP pools, ensuring excellent IP reclamation to avoid IP leakage. In additions, it provides [wonderful IPAM performance](./docs/concepts/ipam-performance.md). The IPAM of Spiderpool could be available for any main CNI supporting third-party IPAM plugin, not only including [Macvlan CNI](https://github.com/containernetworking/plugins/tree/main/plugins/main/macvlan), [ipvlan CNI](https://github.com/containernetworking/plugins/tree/main/plugins/main/ipvlan), and [SR-IOV CNI](https://github.com/k8snetworkplumbingwg/sriov-cni), but also [calico](https://github.com/projectcalico/calico) and [weave](https://github.com/weaveworks/weave) as static IP usage. diff --git a/charts/spiderpool/templates/configmap.yaml b/charts/spiderpool/templates/configmap.yaml index c9cc7946b..75a053f87 100644 --- a/charts/spiderpool/templates/configmap.yaml +++ b/charts/spiderpool/templates/configmap.yaml @@ -37,7 +37,7 @@ data: namespacesExclude: {{ toJson .Values.spiderpoolController.podResourceInject.namespacesExclude }} namespacesInclude: {{ toJson .Values.spiderpoolController.podResourceInject.namespacesInclude }} iaasNetworkProvider: - serverUrl: {{ .Values.iaasNetworkProvider.serverUrl | quote }} + serverUrl: {{ (.Values.iaasNetworkProvider).serverUrl | default "" | quote }} {{- if .Values.multus.multusCNI.install }} --- kind: ConfigMap diff --git a/cmd/spiderpool/cmd/command_add.go b/cmd/spiderpool/cmd/command_add.go index 77b1c8c81..5176e95fb 100644 --- a/cmd/spiderpool/cmd/command_add.go +++ b/cmd/spiderpool/cmd/command_add.go @@ -16,7 +16,6 @@ import ( current "github.com/containernetworking/cni/pkg/types/100" "github.com/containernetworking/plugins/pkg/ns" "github.com/go-openapi/strfmt" - "github.com/vishvananda/netlink" "go.uber.org/multierr" "go.uber.org/zap" @@ -66,25 +65,6 @@ func CmdAdd(args *skel.CmdArgs) (err error) { return fmt.Errorf("failed to setup file logging: %w", err) } - // When IPAM is invoked, the NIC is down and must be set it up in order to detect IP conflicts and - // gateway reachability. - err = netns.Do(func(netNS ns.NetNS) error { - l, err := netlink.LinkByName(args.IfName) - if err != nil { - return fmt.Errorf("failed to get link: %w", err) - } - - if err = netlink.LinkSetUp(l); err != nil { - return fmt.Errorf("failed to set link up: %w", err) - } - - logger.Sugar().Debugf("Set link %s to up for IP conflict and gateway detection", args.IfName) - return nil - }) - if err != nil { - return fmt.Errorf("failed to set link up: %w", err) - } - hostNs, err := ns.GetCurrentNS() if err != nil { return fmt.Errorf("failed to get current netns: %w", err) diff --git a/cmd/spiderpool/cmd/command_delete.go b/cmd/spiderpool/cmd/command_delete.go index 2ab1cd726..ea572b350 100644 --- a/cmd/spiderpool/cmd/command_delete.go +++ b/cmd/spiderpool/cmd/command_delete.go @@ -89,7 +89,7 @@ func CmdDel(args *skel.CmdArgs) (err error) { return err } - ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) + ctx, cancel := context.WithTimeout(context.Background(), 100*time.Second) defer cancel() params := daemonset.NewDeleteIpamIPParams(). diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml index f27526705..81d475426 100644 --- a/docs/mkdocs.yml +++ b/docs/mkdocs.yml @@ -100,6 +100,7 @@ nav: - IPoIB For Infiniband: usage/ipoib.md - Multi-Cluster Networking: usage/submariner.md - Access Service for Underlay CNI: usage/underlay_cni_service.md + - IaaS Network Provider: usage/iaas-network-provider.md - Bandwidth Manage for IPVlan CNI: usage/ipvlan_bandwidth.md - Kubevirt: usage/kubevirt.md - Istio: usage/istio.md diff --git a/docs/usage/iaas-network-provider-zh_CN.md b/docs/usage/iaas-network-provider-zh_CN.md new file mode 100644 index 000000000..3adcd5ffb --- /dev/null +++ b/docs/usage/iaas-network-provider-zh_CN.md @@ -0,0 +1,274 @@ +# IaaS Network Provider + +[**English**](./iaas-network-provider.md) | **简体中文** + +## 概述 + +Spiderpool 支持对接通用的 IaaS Network Provider。当 Spiderpool 分配或释放 Pod IP 地址时,可以调用配置的 Provider,在云平台侧完成对应 IaaS IP 资源的绑定或解绑。 + +该能力适用于公有云或私有云环境。在这些环境中,Spiderpool 分配出的 IP 地址可能还需要在外部云网络系统中完成注册、绑定或转发面配置后,Pod 才能正常使用。 + +典型使用场景包括: + +- 从云平台申请辅助 IP 资源。 +- 将 IP 绑定到节点、ENI、辅助网卡、VLAN 子接口或其它云网络资源。 +- 向 Spiderpool 返回 Pod 网卡所需的 MAC 地址、VLAN ID 等云平台属性。 +- 当 Spiderpool 释放 Pod IP 时,同步释放 IaaS 侧的 IP 绑定关系。 + +## 工作原理 + +启用该能力后,Spiderpool 会执行以下流程: + +1. Pod IP 分配阶段,Spiderpool 先从 Spiderpool IP 池中分配 IP,然后调用 IaaS Network Provider 的分配接口。 +2. IaaS Network Provider 在云平台侧完成 IP 绑定,并返回云平台侧的网络属性。 +3. Spiderpool 将返回的 MAC 地址和 VLAN ID 写入分配结果,后续 VLAN CNI 流程使用这些信息配置 Pod 网卡。 +4. Pod IP 释放阶段,Spiderpool 会针对每个需要释放的 IPv4 地址调用 IaaS Network Provider 的释放接口。 +5. IaaS 释放接口调用成功后,Spiderpool 再从内部 IP 池中释放该 IP。这里的“调用成功”代表 IaaS Network Provider 已成功接收释放请求并开始云平台侧清理,并不保证云平台侧 IP 资源已经彻底释放完成(云平台可能因限速或异步机制仍在处理)。 + +IaaS Network Provider 是一个 HTTP 服务。Spiderpool 只定义通用 API 契约,不依赖某个具体云厂商实现。 + +## 使用方式 + +通过 Helm values 配置 Provider URL: + +```yaml +ipam: + enableGatewayDetection: false + enableIPConflictDetection: false +plugins: + installVlanCNI: true +iaasNetworkProvider: + serverUrl: "http://iaas-network-provider.iaas-network-provider-system.svc:80" +``` + +- 如果 `iaasNetworkProvider.serverUrl` 为空,Spiderpool 不会调用 IaaS Network Provider。 +- 必须同时启用 `plugins.installVlanCNI`。 +- 必须关闭 `ipam.enableGatewayDetection` 和 `ipam.enableIPConflictDetection` 关闭网关可达性检测和 IP 冲突检测。此模式和传统先调用 CNI 后调用 IPAM 方式不同,必须先调用 IPAM 获取 Iaas IP 信息才能调用 CNI 完成 Pod 网络设置。所以网关可达性检测和 IP 冲突检测在此模式下无法工作。 + +> **注意**:[VLAN-CNI](https://github.com/spidernet-io/vlan-cni) 是 Spiderpool 基于社区 cni-plugin 项目开发的 VLAN CNI 插件,用于对接第三方云平台 IaaS Network Provider,为容器创建 IaaS 层的 VLAN 子网卡。 + +### 检查功能是否已启用 + +安装后可以通过以下方式确认该功能是否已生效: + +1. **查看 ConfigMap** + + ```bash + kubectl get configmap spiderpool-conf -n -o yaml | grep iaasNetworkProvider + ``` + + 如果输出中包含 `iaasNetworkProvider.serverUrl` 且值非空,说明功能已启用。 + +2. **查看 agent 启动日志** + + ```bash + kubectl logs spiderpool-agent-xxx -n + ``` + + 在 agent 启动日志中搜索 `IaaS client created successfully`。如果看到该日志,说明 agent 已成功初始化 IaaS client,功能已启用。如果看到 `IaaS provider configuration validation failed`,说明配置存在问题,需要检查 `serverUrl` 格式是否正确。 + +### 配置 VLAN CNI + +对接 IaaS Network Provider 时,必须使用 VLAN CNI 为 Pod 创建 VLAN 子接口,并将云平台分配的 VLAN ID 和 MAC 地址等属性配置到该子接口上,以确保 Pod 网卡配置与云平台侧保持一致,从而实现正常通信。 + +如果手动静态配置 VLAN ID,将与云平台动态分配的 VLAN ID 不一致,导致网络通信异常。因此 **SpiderMultusConfig 的 `vlan` 配置中不能填写 `vlanID`**,否则 [vlan-cni](https://github.com/spidernet-io/vlan-cni) 将无法为 Pod 创建配置正确的 VLAN 子接口。 + +> [vlan-cni](https://github.com/spidernet-io/vlan-cni) 在 Pod 创建时通过 Unix socket 向本地 spiderpool-agent 查询从 IaaS 分配的 VLAN ID 和 MAC 地址等信息,然后基于这些信息在 Pod 网络命名空间中创建 VLAN 子接口。 + +平台管理员需要提前在 IaaS 侧完成以下准备: + +- 创建 VPC 子网并绑定到节点弹性网卡。例如,将 VPC 子网 `172.91.0.0/24` 绑定到节点 ECS-01 的网卡 `enp0s28`。 + +然后在 PaaS 侧创建对应的 SpiderMultusConfig 和 SpiderIPPool 资源,示例如下: + +```yaml +apiVersion: spiderpool.spidernet.io/v2beta1 +kind: SpiderMultusConfig +metadata: + name: iaas-vlan-config + namespace: spiderpool +spec: + cniType: vlan + vlan: + master: + - enp0s28 + ippools: + ipv4: + - pool-enp0s28 +--- +apiVersion: spiderpool.spidernet.io/v2beta1 +kind: SpiderIPPool +metadata: + name: pool-enp0s28 +spec: + gateway: 172.91.0.1 + ips: + - 172.91.0.100-172.91.0.120 + subnet: 172.91.0.0/24 +``` + +- `master` 为必填字段,必须与节点上的物理网卡名称一致,且要求集群内各节点的网卡名称保持统一。 +- `subnet` 为必填字段,必须与云平台侧的 VPC 子网保持一致。 + +## API 契约 + +Provider 需要实现以下 HTTP API。 + +### 分配 IP + +#### 请求 + +```text +POST /v1/apis/network.iaas.io/ipam/allocate-ips +Content-Type: application/json +``` + +请求体: + +```json +{ + "podName": "example-pod", + "podNamespace": "default", + "podUID": "9f8b7c6d-xxxx-xxxx-xxxx-xxxxxxxxxxxx", + "nodeName": "worker-1", + "iaasIPsAllocationRequest": [ + { + "ipAddress": "10.0.0.10", + "subnet": "10.0.0.0/24", + "parentNicMac": "fa:16:3e:11:22:33" + } + ] +} +``` + +字段说明: + +| 字段 | 是否必填 | 说明 | +| --- | --- | --- | +| `podName` | 否 | Pod 名称。 | +| `podNamespace` | 否 | Pod 命名空间。 | +| `podUID` | 否 | Pod UID。 | +| `nodeName` | 是 | Pod 所在节点。 | +| `iaasIPsAllocationRequest` | 是 | Spiderpool 已分配、期望 Provider 绑定的 IP 列表。 | +| `ipAddress` | 是 | 不带 CIDR 前缀的 IP 地址。 | +| `subnet` | 是 | IP 所属的子网 CIDR。 | +| `parentNicMac` | 是 | 承载该 Pod 网络的父网卡 MAC 地址。 | + +#### 响应 + +任意 HTTP `2xx` 状态码都会被 Spiderpool 视为成功。 + +响应体: + +```json +{ + "podName": "example-pod", + "podNamespace": "default", + "nodeName": "worker-1", + "iaasIPsAllocationResponse": [ + { + "parentNicMac": "fa:16:3e:11:22:33", + "subnet": "10.0.0.0/24", + "ipAddress": "10.0.0.10", + "macAddress": "fa:16:3e:aa:bb:cc", + "vlanId": 100 + } + ] +} +``` + +字段说明: + +| 字段 | 是否必填 | 说明 | +| --- | --- | --- | +| `iaasIPsAllocationResponse` | 是 | Provider 返回的分配结果列表。 | +| `parentNicMac` | 是 | Provider 使用的父网卡 MAC 地址。 | +| `subnet` | 是 | IP 所属的子网 CIDR。 | +| `ipAddress` | 是 | Provider 已完成绑定的 IP 地址。 | +| `macAddress` | 否 | 云平台为 Pod 网卡分配的 MAC 地址。 | +| `vlanId` | 否 | 云平台分配的 VLAN ID。 | + +如果 `macAddress` 或 `vlanId` 为空,Spiderpool 会保留原始分配结果中的对应字段。 + +### 释放 IP + +#### 请求 + +```text +POST /v1/apis/network.iaas.io/ipam/release-ip +Content-Type: application/json +``` + +请求体: + +```json +{ + "podName": "example-pod", + "podNamespace": "default", + "podUID": "9f8b7c6d-xxxx-xxxx-xxxx-xxxxxxxxxxxx", + "nodeName": "worker-1", + "parentNicMac": "fa:16:3e:11:22:33", + "subnet": "10.0.0.0/24", + "ipAddress": "10.0.0.10" +} +``` + +字段说明: + +| 字段 | 是否必填 | 说明 | +| --- | --- | --- | +| `podName` | 否 | Pod 名称。 | +| `podNamespace` | 否 | Pod 命名空间。 | +| `podUID` | 否 | Pod UID。 | +| `nodeName` | 是 | Pod 原本所在节点。 | +| `parentNicMac` | 否 | 父网卡 MAC 地址。在 controller 侧 GC 场景下可能为空。 | +| `subnet` | 是 | IP 所属的子网 CIDR。 | +| `ipAddress` | 是 | 需要释放的 IP 地址。 | + +#### 响应 + +Spiderpool 会忽略响应体。任意 HTTP `2xx` 状态码都会被视为成功。 + +## 特殊场景处理 + +### 分配接口必须同步成功 + +Spiderpool 在分配 IP 时采用同步调用方式:只有 Provider 完成 IaaS 侧 IP 绑定并正常返回网络配置后,Spiderpool 才会更新该 IP 在 SpiderIPPool 中的状态,并创建或更新对应的 SpiderEndpoint 对象。 + +在一些异常场景下: + +- 如果 Provider 或云平台对 API 进行限流,处理时间过长导致 Spiderpool 等待 HTTP 响应超时,本次分配将被视为失败。 +- 如果 Provider 侧故障无法响应,Spiderpool 会等待超时时间后将本次分配视为失败。 + +如果 Spiderpool-agent 在指定时间内(2 min)没有收到 Provider 的成功响应,那么本次分配将被视为失败, 会阻止 Pod 创建,Pod 会遵循 K8s 的重试机制进行重试。 + +### 释放接口应该具备幂等性 + +释放接口应该是幂等的。如果 IP 已经释放,或者云平台侧已经不存在该 IP 绑定关系,只要可以安全地认为该 IP 已释放,Provider 就应该返回 `2xx` 状态码。 + +这样可以避免 CNI DEL 重复调用或 GC 重试时产生不必要的失败。 + +### 释放操作支持最终一致 + +某些云平台的 IP 释放操作较慢,受限速或异步清理机制影响,Provider 收到释放请求后,云平台侧资源不一定立即完成清理。 + +Spiderpool 要求 Provider 能够接收释放请求并启动云平台侧清理流程。只要释放请求已被接受,或 IP 已处于已释放状态,Provider 即可返回成功。 + +Spiderpool 会先调用 IaaS 释放接口,再释放 Spiderpool 内部 IP 池中的 IP。这个顺序可以避免 Spiderpool 在云平台尚未接受释放请求前重新分配同一个 IP。如果云平台在此之后异步完成最终清理,不会阻塞 Spiderpool 当前的 IP 释放流程。 + +### 父网卡 MAC 地址 + +当 Spiderpool 能够解析父网卡 MAC 地址时,会在请求中携带 `parentNicMac`。在 agent 侧的分配和释放场景下,Spiderpool 通常可以通过运行时网络环境或本地缓存获取该值。 + +在 controller 侧 GC 场景中,Spiderpool 不一定运行在各节点的 host network namespace 中,因此可能无法获取父网卡 MAC 地址。此时,Spiderpool 发送的释放请求中 `parentNicMac` 字段可能为空,Provider 的释放接口需要能够容忍该字段缺失。 + +## 异常场景处理 + +Spiderpool 会将以下情况视为失败: + +- HTTP 请求失败。 +- HTTP 响应状态码不是 `2xx`。 +- 分配响应 JSON 无法解析。 +- 分配响应中包含 Spiderpool 未请求的 IP。 + +当释放失败时,Spiderpool 可能根据触发释放的路径,在后续清理流程中进行重试。因此 Provider 的释放接口应支持幂等重试。 diff --git a/docs/usage/iaas-network-provider.md b/docs/usage/iaas-network-provider.md new file mode 100644 index 000000000..21e6b5ea0 --- /dev/null +++ b/docs/usage/iaas-network-provider.md @@ -0,0 +1,278 @@ +# IaaS Network Provider + +**English** | [**简体中文**](./iaas-network-provider-zh_CN.md) + +## Overview + +Spiderpool can integrate with a generic IaaS Network Provider. When Spiderpool allocates or releases Pod IP addresses, it calls the configured provider to bind or unbind the corresponding IaaS-side IP resources on a cloud platform. + +This feature is useful for public cloud or private cloud environments where an IP address assigned by Spiderpool must also be registered, bound, or programmed in an external cloud network system before the Pod can use it correctly. + +Typical use cases include: + +- Allocating auxiliary IP resources from a cloud platform. +- Binding an IP to a node, ENI, auxiliary network interface, VLAN sub-interface, or other cloud networking resource. +- Returning cloud-specific attributes such as Pod interface MAC address and VLAN ID to Spiderpool. +- Releasing the IaaS-side IP binding when Spiderpool releases the Pod IP. + +## How it works + +When the feature is enabled, Spiderpool performs the following calls: + +1. During Pod IP allocation, Spiderpool allocates IPs from Spiderpool IP pools first, then calls the IaaS Network Provider allocation API. +2. The IaaS Network Provider binds the IP on the cloud platform and returns the cloud-side network attributes. +3. Spiderpool writes the returned MAC address and VLAN ID into the allocation result, and the VLAN CNI pipeline uses them to configure the Pod interface. +4. During Pod IP release, Spiderpool calls the IaaS Network Provider release API for each IPv4 address that should be released. +5. After the IaaS release call returns successfully, Spiderpool releases the IP from the internal IP pool. "Success" here means the IaaS Network Provider has accepted the release request and started the cloud-side cleanup. It does **not** guarantee that the IaaS-side IP resource is fully released, because the cloud platform may still be processing due to rate limits or asynchronous cleanup. + +The IaaS Network Provider is an HTTP service. Spiderpool only defines the API contract and does not depend on a specific cloud vendor implementation. + +## Usage + +Configure the provider URL through Helm values: + +```yaml +ipam: + enableGatewayDetection: false + enableIPConflictDetection: false +plugins: + installVlanCNI: true +iaasNetworkProvider: + serverUrl: "http://iaas-network-provider.iaas-network-provider-system.svc:80" +``` + +- If `iaasNetworkProvider.serverUrl` is empty, Spiderpool does not call the IaaS Network Provider. +- `plugins.installVlanCNI` must also be enabled. +- `ipam.enableGatewayDetection` and `ipam.enableIPConflictDetection` must be disabled. This mode is different from the traditional approach of calling CNI first and then calling IPAM. In this mode, IPAM must be called first to obtain the IaaS IP information before calling CNI to complete the Pod network configuration. Therefore, gateway detection and IP conflict detection cannot work in this mode. + +> **Note**: [VLAN-CNI](https://github.com/spidernet-io/vlan-cni) is a VLAN CNI plugin developed by Spiderpool based on the upstream community cni-plugin project. It can be used to integrate with third-party cloud platform IaaS Network Providers, allocating IaaS-layer VLAN network interfaces for containers. + +The URL must include the scheme, host, and port. Spiderpool appends the fixed API paths to this base URL. + +### Verify the feature is enabled + +After installation, you can verify whether the feature is active by: + +1. **Check the ConfigMap** + + ```bash + kubectl get configmap spiderpool-conf -n -o yaml | grep iaasNetworkProvider + ``` + + If the output includes `iaasNetworkProvider.serverUrl` and the value is non-empty, the feature is enabled. + +2. **Check agent startup logs** + + ```bash + kubectl logs spiderpool-agent-xxx -n + ``` + + Search for `IaaS client created successfully` in the agent startup logs. If you see this log, the agent has successfully initialized the IaaS client and the feature is active. If you see `IaaS provider configuration validation failed`, there is a configuration issue; verify that the `serverUrl` format is correct. + +### Configure VLAN CNI + +When integrating with the IaaS Network Provider, you must use VLAN CNI to create VLAN sub-interfaces for Pods, and configure the VLAN ID and MAC address allocated by the cloud platform on those sub-interfaces. This ensures that the VLAN sub-interface configuration is consistent with the cloud platform, enabling normal network communication. + +If the VLAN ID is manually configured at this point, it will be inconsistent with the VLAN ID allocated by the cloud platform, leading to network communication anomalies. Therefore, **do not set `vlanID` in the `vlan` configuration of SpiderMultusConfig**; otherwise [vlan-cni](https://github.com/spidernet-io/vlan-cni) will be unable to create a correctly configured VLAN sub-interface for the Pod. + +> [vlan-cni](https://github.com/spidernet-io/vlan-cni) queries the local spiderpool-agent via a Unix socket during Pod creation to obtain the VLAN ID and MAC address allocated from the IaaS, and then creates the VLAN sub-interface in the Pod network namespace based on this information. + +In addition, platform administrators need to prepare the IaaS side in advance: + +- Create a VPC subnet and bind it to the elastic network interface. For example, bind the VPC subnet `172.91.0.0/24` to the network interface `enp0s28` on node `ECS-01`. + +Then create the corresponding SpiderMultusConfig and SpiderIPPool resources on the PaaS side: + +Example configuration: + +```yaml +apiVersion: spiderpool.spidernet.io/v2beta1 +kind: SpiderMultusConfig +metadata: + name: iaas-vlan-config + namespace: spiderpool +spec: + cniType: vlan + vlan: + master: + - enp0s28 + ippools: + ipv4: + - pool-enp0s28 +--- +apiVersion: spiderpool.spidernet.io/v2beta1 +kind: SpiderIPPool +metadata: + name: pool-enp0s28 +spec: + gateway: 172.91.0.1 + ips: + - 172.91.0.100-172.91.0.120 + subnet: 172.91.0.0/24 +``` + +- `master` is a required field. It must match the physical network interface name on the node, and the interface name must be consistent across all nodes. +- `subnet` is a required field. It must match the VPC subnet on the cloud platform. + +## API contract + +The provider must implement the following HTTP APIs. + +### Allocate IPs + +#### Request + +```text +POST /v1/apis/network.iaas.io/ipam/allocate-ips +Content-Type: application/json +``` + +Request body: + +```json +{ + "podName": "example-pod", + "podNamespace": "default", + "podUID": "9f8b7c6d-xxxx-xxxx-xxxx-xxxxxxxxxxxx", + "nodeName": "worker-1", + "iaasIPsAllocationRequest": [ + { + "ipAddress": "10.0.0.10", + "subnet": "10.0.0.0/24", + "parentNicMac": "fa:16:3e:11:22:33" + } + ] +} +``` + +Fields: + +| Field | Required | Description | +| --- | --- | --- | +| `podName` | No | Pod name. | +| `podNamespace` | No | Pod namespace. | +| `podUID` | No | Pod UID. | +| `nodeName` | Yes | Node where the Pod is scheduled. | +| `iaasIPsAllocationRequest` | Yes | IPs that Spiderpool has allocated and expects the provider to bind. | +| `ipAddress` | Yes | IP address without CIDR prefix. | +| `subnet` | Yes | Subnet CIDR of the IP. | +| `parentNicMac` | Yes | MAC address of the parent NIC that carries the Pod network. | + +#### Response + +Any HTTP `2xx` status code is treated as success. + +Response body: + +```json +{ + "podName": "example-pod", + "podNamespace": "default", + "nodeName": "worker-1", + "iaasIPsAllocationResponse": [ + { + "parentNicMac": "fa:16:3e:11:22:33", + "subnet": "10.0.0.0/24", + "ipAddress": "10.0.0.10", + "macAddress": "fa:16:3e:aa:bb:cc", + "vlanId": 100 + } + ] +} +``` + +Fields: + +| Field | Required | Description | +| --- | --- | --- | +| `iaasIPsAllocationResponse` | Yes | Allocation results returned by the provider. | +| `parentNicMac` | Yes | Parent NIC MAC used by the provider. | +| `subnet` | Yes | Subnet CIDR of the IP. | +| `ipAddress` | Yes | IP address that was bound by the provider. | +| `macAddress` | No | MAC address assigned by the cloud platform for the Pod interface. | +| `vlanId` | No | VLAN ID assigned by the cloud platform. | + +If `macAddress` or `vlanId` is empty, Spiderpool keeps the original allocation result for that field. + +### Release IP + +#### Request + +```text +POST /v1/apis/network.iaas.io/ipam/release-ip +Content-Type: application/json +``` + +Request body: + +```json +{ + "podName": "example-pod", + "podNamespace": "default", + "podUID": "9f8b7c6d-xxxx-xxxx-xxxx-xxxxxxxxxxxx", + "nodeName": "worker-1", + "parentNicMac": "fa:16:3e:11:22:33", + "subnet": "10.0.0.0/24", + "ipAddress": "10.0.0.10" +} +``` + +Fields: + +| Field | Required | Description | +| --- | --- | --- | +| `podName` | No | Pod name. | +| `podNamespace` | No | Pod namespace. | +| `podUID` | No | Pod UID. | +| `nodeName` | Yes | Node where the Pod was running. | +| `parentNicMac` | No | Parent NIC MAC. It may be empty in controller-side GC scenarios. | +| `subnet` | Yes | Subnet CIDR of the IP. | +| `ipAddress` | Yes | IP address to release. | + +#### Response + +The response body is ignored. Any HTTP `2xx` status code is treated as success. + +## Special scenario handling + +### Allocation must be synchronously successful + +Currently, Spiderpool only continues to update the IP status in SpiderIPPool and create or update the SpiderEndpoint object after the Provider has completed the IaaS-side IP binding and returned the network configuration normally. + +In some abnormal scenarios: + +- If the Provider or cloud platform throttles the API and the processing takes a long time, causing Spiderpool to time out while waiting for the HTTP response, Spiderpool will treat this allocation as failed. +- If the Provider side fails to respond, Spiderpool will wait for the timeout period and then treat this allocation as failed. + +If the spiderpool-agent does not receive a successful response from the Provider within the specified time (2 minutes), this allocation will be treated as a failure, and the Pod will be retried according to Kubernetes retry mechanisms. + +### Release should be idempotent + +The release API should be idempotent. If the IP has already been released or does not exist on the cloud platform, the provider should return a `2xx` status code when it is safe to consider the IP released. + +This avoids repeated CNI DEL or GC retries causing unnecessary failures. + +### Release may be eventually completed + +Some cloud platforms release IaaS IP resources slowly due to cloud-side rate limits or asynchronous cleanup mechanisms. Therefore, IP release may not be fully completed immediately after the provider receives the release request. + +Spiderpool requires the provider to accept the release request and start the cloud-side cleanup. The provider should return success when the release request is accepted or when the IP is already released. + +Spiderpool calls the IaaS release API before releasing the IP from Spiderpool's internal IP pool. This order avoids re-allocating an IP in Spiderpool before the cloud platform has accepted the release request. If the cloud platform completes the cleanup asynchronously after that, it does not block Spiderpool's IP release flow. + +### Parent NIC MAC lookup + +Spiderpool passes `parentNicMac` when it can determine the parent NIC MAC address. In agent-side allocation and release, Spiderpool can usually resolve the value from the runtime network environment or cache. + +In controller-side GC, Spiderpool may not run in the host network namespace of every node, so it may not be able to resolve the parent NIC MAC. In such cases, Spiderpool may send an empty `parentNicMac` during release. Provider implementations should tolerate this for the release API. + +## Abnormal scenario handling + +Spiderpool treats the following cases as failures: + +- HTTP request failure. +- Non-`2xx` HTTP response status. +- Invalid allocation response JSON. +- Allocation response containing unknown IPs. + +When release fails, Spiderpool may retry through later cleanup flows depending on where the release is triggered. Provider implementations should therefore make release operations safe to retry. diff --git a/docs/usage/readme-zh_CN.md b/docs/usage/readme-zh_CN.md index caed5953c..218a182ba 100644 --- a/docs/usage/readme-zh_CN.md +++ b/docs/usage/readme-zh_CN.md @@ -97,6 +97,8 @@ AI 集群通常使用多轨的 RDMA 网络为 GPU 提供通信。Spiderpool 可 - 合理的 IP 回收机制设计,使得集群或应用在故障恢复过程中,能够及时分配到 IP 地址。可参考[例子](../concepts/ipam-des-zh_CN.md)。 +- Spiderpool 支持对接 IaaS Network Provider,在 IP 分配和释放时绑定与释放云平台侧 IP 资源。可参考 [IaaS Network Provider 说明](./iaas-network-provider-zh_CN.md)。 + ### 多网卡功能 - 支持为 Pod 多网卡分配不同子网的 IP 地址;帮助所有网卡之间协调策略路由,以确保请求向和回复向数据路径一致,避免丢包;支持定制哪张网卡的网关作为缺省路由。 diff --git a/docs/usage/readme.md b/docs/usage/readme.md index 5528865f7..29d6ce0b0 100644 --- a/docs/usage/readme.md +++ b/docs/usage/readme.md @@ -96,6 +96,8 @@ For instructions on how to upgrade Spiderpool, please refer to the [upgrade guid - Well-designed IP reclamation mechanisms promptly allocate IP addresses during cluster or application recovery processes. Refer to the [example](../concepts/ipam-des.md) for details. +- Spiderpool can integrate with a generic IaaS Network Provider to bind and release cloud-side IP resources during IP allocation and release. Refer to the [IaaS Network Provider](./iaas-network-provider.md) guide for details. + ### Multiple Network Interfaces Features - Spiderpool offers the ability to assign IP addresses from different subnets to multiple network interfaces of a Pod. This feature ensures coordinated policy routing among all interfaces, guaranteeing consistent data paths for outgoing and incoming requests and mitigating packet loss. Moreover, it allows for customization of the default route using a specific network interface's gateway. diff --git a/images/spiderpool-plugins/version.sh b/images/spiderpool-plugins/version.sh index 15d963096..17a3cc052 100644 --- a/images/spiderpool-plugins/version.sh +++ b/images/spiderpool-plugins/version.sh @@ -18,4 +18,4 @@ export IB_SRIOV_VERSION=${IB_SRIOV_VERSION:-"v1.3.0"} # https://github.com/Mellanox/ipoib-cni export IPOIB_VERSION=${IPOIB_VERSION:-"v1.2.2"} # https://github.com/spidernet-io/vlan-cni -export VLAN_VERSION=${VLAN_VERSION:-"0.0.1"} +export VLAN_VERSION=${VLAN_VERSION:-"v0.0.1"} diff --git a/pkg/gcmanager/scanAll_IPPool.go b/pkg/gcmanager/scanAll_IPPool.go index 791f34b92..833540b2a 100644 --- a/pkg/gcmanager/scanAll_IPPool.go +++ b/pkg/gcmanager/scanAll_IPPool.go @@ -402,12 +402,13 @@ func (s *SpiderGC) executeScanAll(ctx context.Context) { if endpoint != nil { nodeName = endpoint.Status.Current.Node } - if releaseErr := s.iaasClient.ReleaseIPs(ctx, &iaasclient.ReleaseIPsRequest{ + if releaseErr := s.iaasClient.ReleaseIP(ctx, &iaasclient.ReleaseIPRequest{ PodName: podName, PodNamespace: podNS, PodUID: poolIPAllocation.PodUID, NodeName: nodeName, - IPAddresses: []string{poolIP}, + Subnet: pool.Spec.Subnet, + IPAddress: poolIP, }); releaseErr != nil { scanAllLogger.Sugar().Errorf("failed to release IaaS IP '%s', error: '%v'", poolIP, releaseErr) } else { diff --git a/pkg/gcmanager/tracePod_worker.go b/pkg/gcmanager/tracePod_worker.go index 5f86689e1..d68eca0d3 100644 --- a/pkg/gcmanager/tracePod_worker.go +++ b/pkg/gcmanager/tracePod_worker.go @@ -6,6 +6,7 @@ package gcmanager import ( "context" "fmt" + "net" "sync" "sync/atomic" "time" @@ -163,28 +164,30 @@ func (s *SpiderGC) releaseIPPoolIPExecutor(ctx context.Context, workerIndex int) // Release IPs from IaaS provider after releasing from internal IPPools if s.iaasClient != nil { - var ipAddresses []string for _, detail := range endpoint.Status.Current.IPs { if detail.IPv4 != nil { - ipAddresses = append(ipAddresses, *detail.IPv4) + ip, subnet, err := net.ParseCIDR(*detail.IPv4) + if err != nil { + log.Sugar().Errorf("failed to parse CIDR '%s', error: %v, skip releasing IaaS IP '%s'", *detail.IPv4, err, *detail.IPv4) + continue + } + req := &iaasclient.ReleaseIPRequest{ + PodName: podCache.PodName, + PodNamespace: podCache.Namespace, + PodUID: podCache.UID, + NodeName: endpoint.Status.Current.Node, + Subnet: subnet.String(), + IPAddress: ip.String(), + } + if err := s.iaasClient.ReleaseIP(ctx, req); err != nil { + log.Sugar().Errorf("failed to release IaaS IP '%s' for '%s/%s', error: %v", + ip.String(), podCache.Namespace, podCache.PodName, err) + return err + } + log.Sugar().Infof("successfully released IaaS IP '%s' for '%s/%s'", + ip.String(), podCache.Namespace, podCache.PodName) } } - if len(ipAddresses) > 0 { - req := &iaasclient.ReleaseIPsRequest{ - PodName: podCache.PodName, - PodNamespace: podCache.Namespace, - PodUID: podCache.UID, - NodeName: endpoint.Status.Current.Node, - IPAddresses: ipAddresses, - } - if err := s.iaasClient.ReleaseIPs(ctx, req); err != nil { - log.Sugar().Errorf("failed to release IaaS IPs for '%s/%s', error: %v", - podCache.Namespace, podCache.PodName, err) - return err - } - log.Sugar().Infof("successfully released IaaS IPs %v for '%s/%s'", - ipAddresses, podCache.Namespace, podCache.PodName) - } } // delete StatefulSet/kubevirtVMI wep (other controller wep has OwnerReference, its lifecycle is same with pod) diff --git a/pkg/iaas/client/client.go b/pkg/iaas/client/client.go index 00dacd04f..d35c740f8 100644 --- a/pkg/iaas/client/client.go +++ b/pkg/iaas/client/client.go @@ -10,7 +10,6 @@ import ( "encoding/json" "fmt" "io" - "net" "net/http" "net/url" "sync" @@ -26,25 +25,17 @@ const ( releaseAPIPath = "/v1/apis/network.iaas.io/ipam/release-ip" ) -// ParentNicMacLookupFunc is a fallback function to look up parentNicMac -// when the cache does not have the value. It receives the context and the IP CIDR string. -type ParentNicMacLookupFunc func(ctx context.Context, ipCIDR string) (string, error) - // Client is the interface for IaaS provider API client type Client interface { // AllocateIPs calls the IaaS provider to allocate IPs AllocateIPs(ctx context.Context, req *AllocateIPRequest) (*AllocateIPResponse, error) // ReleaseIPs calls the IaaS provider to release IPs - ReleaseIPs(ctx context.Context, req *ReleaseIPsRequest) error + ReleaseIP(ctx context.Context, req *ReleaseIPRequest) error // GetCachedParentNicMac returns the cached parent NIC MAC for the given key, - // or empty string if not cached. Key can be SpiderMultusConfig namespace/name - // or IP CIDR string. + // or empty string if not cached. Key is SpiderMultusConfig namespace/name. GetCachedParentNicMac(key string) (string, bool) // CacheParentNicMac stores a parent NIC MAC for the given key. CacheParentNicMac(key string, mac string) - // SetParentNicMacLookupFunc sets a fallback lookup function for parentNicMac - // when cache misses (e.g., after agent restart). - SetParentNicMacLookupFunc(fn ParentNicMacLookupFunc) } // IaaSClient implements the Client interface @@ -54,13 +45,8 @@ type IaaSClient struct { logger *zap.Logger // parentNicMacCache caches key -> parent NIC MAC address. - // Keys include both SpiderMultusConfig namespace/name and IP CIDR strings, - // so that release path can look up parentNicMac by IP. + // Keys use SpiderMultusConfig namespace/name. parentNicMacCache sync.Map - - // parentNicMacLookupFunc is a fallback function to look up parentNicMac - // when the cache does not have the value (e.g., after agent restart). - parentNicMacLookupFunc ParentNicMacLookupFunc } // ValidateConfig validates the IaaS provider configuration. @@ -177,16 +163,14 @@ func (c *IaaSClient) AllocateIPs(ctx context.Context, req *AllocateIPRequest) (* return &allocateResp, nil } -// ReleaseIPs calls the IaaS provider to release IPs. -// The provider only supports releasing one IP per request, so this method -// loops over each IP and calls the API individually. -func (c *IaaSClient) ReleaseIPs(ctx context.Context, req *ReleaseIPsRequest) error { +// ReleaseIP calls the IaaS provider to release an IP. +func (c *IaaSClient) ReleaseIP(ctx context.Context, req *ReleaseIPRequest) error { c.logger.Debug("Calling IaaS release API", zap.String("url", c.baseURL), zap.String("nodeName", req.NodeName), - zap.String("podName", req.PodName), - zap.String("podNamespace", req.PodNamespace), - zap.Strings("ipAddresses", req.IPAddresses), + zap.String("ipAddress", req.IPAddress), + zap.String("subnet", req.Subnet), + zap.String("parentNicMac", req.ParentNicMac), ) reqURL, err := url.JoinPath(c.baseURL, releaseAPIPath) @@ -194,45 +178,25 @@ func (c *IaaSClient) ReleaseIPs(ctx context.Context, req *ReleaseIPsRequest) err return fmt.Errorf("failed to construct release URL: %w", err) } - for _, ip := range req.IPAddresses { - c.logger.Debug("Releasing single IP via IaaS", zap.String("ip", ip)) - - ipstr, ipnet, err := net.ParseCIDR(ip) - if err != nil { - c.logger.Error("Failed to parse IP for release", zap.String("ip", ip), zap.Error(err)) - return fmt.Errorf("failed to parse IP %s: %w", ip, err) - } - - // Look up parentNicMac via lookup function (queries SMC-keyed cache or resolves from SpiderMultusConfig) - var parentNicMac string - if c.parentNicMacLookupFunc != nil { - mac, lookupErr := c.parentNicMacLookupFunc(ctx, ip) - if lookupErr != nil { - c.logger.Warn("Failed to lookup parentNicMac, proceeding with empty value", - zap.String("ip", ip), zap.Error(lookupErr)) - } else { - parentNicMac = mac - } - } else { - c.logger.Warn("No parentNicMac lookup function configured, proceeding with empty value", - zap.String("ip", ip)) - } - - singleReq := &ReleaseIPRequest{ - NodeName: req.NodeName, - IPAddress: ipstr.String(), - Subnet: ipnet.String(), - ParentNicMac: parentNicMac, - } - - if err := c.releaseSingleIP(ctx, reqURL, singleReq); err != nil { - return fmt.Errorf("failed to release IP %s: %w", ip, err) - } + singleReq := &ReleaseIPRequest{ + PodName: req.PodName, + PodNamespace: req.PodNamespace, + PodUID: req.PodUID, + NodeName: req.NodeName, + IPAddress: req.IPAddress, + Subnet: req.Subnet, + ParentNicMac: req.ParentNicMac, + } + + if err := c.releaseSingleIP(ctx, reqURL, singleReq); err != nil { + return fmt.Errorf("failed to release IP %s: %w", req.IPAddress, err) } c.logger.Info("IaaS release API succeeded", zap.String("nodeName", req.NodeName), - zap.Strings("ipAddresses", req.IPAddresses), + zap.String("ipAddress", req.IPAddress), + zap.String("subnet", req.Subnet), + zap.String("parentNicMac", req.ParentNicMac), ) return nil @@ -293,12 +257,6 @@ func (c *IaaSClient) CacheParentNicMac(key string, mac string) { c.parentNicMacCache.Store(key, mac) } -// SetParentNicMacLookupFunc sets a fallback lookup function for parentNicMac -// when cache misses (e.g., after agent restart). -func (c *IaaSClient) SetParentNicMacLookupFunc(fn ParentNicMacLookupFunc) { - c.parentNicMacLookupFunc = fn -} - // Close closes the IaaS client func (c *IaaSClient) Close() error { return nil diff --git a/pkg/iaas/client/types.go b/pkg/iaas/client/types.go index 8ef84b171..c06fac86a 100644 --- a/pkg/iaas/client/types.go +++ b/pkg/iaas/client/types.go @@ -54,7 +54,7 @@ type IaaSIPAllocationResult struct { } // ReleaseIPRequest represents the request body for IaaS IP release API -type ReleaseIPsRequest struct { +type ReleaseIPRequest struct { // PodName is optional PodName string `json:"podName,omitempty"` // PodNamespace is optional @@ -63,15 +63,10 @@ type ReleaseIPsRequest struct { PodUID string `json:"podUID,omitempty"` // NodeName is required NodeName string `json:"nodeName"` - // IPAddresses are the IPs being released - IPAddresses []string `json:"ipAddresses"` -} - -type ReleaseIPRequest struct { - // NodeName is required - NodeName string `json:"nodeName"` + // ParentNicMac is optional + ParentNicMac string `json:"parentNicMac,omitempty"` + // Subnet is required + Subnet string `json:"subnet"` // IPAddress is the IP being released - IPAddress string `json:"ipAddress"` - Subnet string `json:"subnet"` - ParentNicMac string `json:"parentNicMac"` + IPAddress string `json:"ipAddress"` } diff --git a/pkg/ipam/allocate.go b/pkg/ipam/allocate.go index fcd49a7f3..618a560d6 100644 --- a/pkg/ipam/allocate.go +++ b/pkg/ipam/allocate.go @@ -439,9 +439,10 @@ func (i *ipam) allocateInStandardMode(ctx context.Context, addArgs *models.IpamA if i.config.IaaSClient != nil { logger.Debug("Calling IaaS provider to allocate IPs", zap.String("nic", *addArgs.IfName)) if _, iaasErr := i.callIaaSAllocate(ctx, pod, results); iaasErr != nil { - logger.Error("IaaS allocate failed, continuing without IaaS allocation", zap.Error(iaasErr)) + logger.Error("IaaS allocate failed, aborting IPAM allocation", zap.Error(iaasErr)) return nil, fmt.Errorf("IaaS IP allocate failed: %w", iaasErr) } + logger.Debug("IaaS allocate succeeded") } logger.Debug("Group custom routes by IP allocation results") diff --git a/pkg/ipam/iaas.go b/pkg/ipam/iaas.go index 16fdffcaf..38ae9325a 100644 --- a/pkg/ipam/iaas.go +++ b/pkg/ipam/iaas.go @@ -54,7 +54,8 @@ func (i *ipam) callIaaSAllocate(ctx context.Context, pod *corev1.Pod, results [] logger.Error("Failed to parse IP address", zap.String("address", *result.IP.Address), zap.Error(err)) return nil, fmt.Errorf("failed to parse IP address: %w", err) } - parentMac, err := i.getParentNicMacFromMultus(ctx, pod, *result.IP.Nic) + subnet := ipNet.String() + parentMac, err := i.getParentNicMacFromMultus(ctx, pod, *result.IP.Nic, subnet) if err != nil { logger.Error("Failed to get parent NIC MAC", zap.String("nic", *result.IP.Nic), zap.Error(err)) return nil, fmt.Errorf("failed to get parent NIC MAC: %w", err) @@ -64,7 +65,7 @@ func (i *ipam) callIaaSAllocate(ctx context.Context, pod *corev1.Pod, results [] req.IaaSIPsAllocationRequest = append(req.IaaSIPsAllocationRequest, iaasclient.IaaSIPAllocationItem{ IPAddress: ipStr, - Subnet: ipNet.String(), + Subnet: subnet, ParentNicMac: parentMac, }) } @@ -107,8 +108,9 @@ func (i *ipam) callIaaSAllocate(ctx context.Context, pod *corev1.Pod, results [] return resp, nil } -// callIaaSRelease calls the IaaS provider API to release IPs -func (i *ipam) callIaaSRelease(ctx context.Context, endpoint *v2beta1.SpiderEndpoint) error { +// callIaaSRelease calls the IaaS provider API to release IPs for all IPv4 addresses in the endpoint. +// It releases each IP individually and aggregates any errors. +func (i *ipam) callIaaSRelease(ctx context.Context, nic string, endpoint *v2beta1.SpiderEndpoint) error { if i.config.IaaSClient == nil { return nil } @@ -118,58 +120,102 @@ func (i *ipam) callIaaSRelease(ctx context.Context, endpoint *v2beta1.SpiderEndp zap.String("namespace", endpoint.Namespace), ) - // Collect all IP addresses from the endpoint allocation details - var ipAddresses []string + var pod *corev1.Pod // lazy-loaded on first cache miss + var errs []error for _, detail := range endpoint.Status.Current.IPs { - // only ipv4 now - if detail.IPv4 != nil { - ipAddresses = append(ipAddresses, *detail.IPv4) + // Only handle IPv4 for now + if detail.IPv4 == nil { + continue } - } - if len(ipAddresses) == 0 { - logger.Debug("No IP addresses to release via IaaS") - return nil - } + ip, subnetCIDR, err := net.ParseCIDR(*detail.IPv4) + if err != nil { + logger.Error("failed to parse CIDR", zap.String("ip", *detail.IPv4), zap.Error(err)) + errs = append(errs, fmt.Errorf("failed to parse CIDR %s: %w", *detail.IPv4, err)) + continue + } + subnet := subnetCIDR.String() + ipStr := ip.String() - req := &iaasclient.ReleaseIPsRequest{ - PodName: endpoint.Name, - PodNamespace: endpoint.Namespace, - PodUID: endpoint.Status.Current.UID, - NodeName: endpoint.Status.Current.Node, - IPAddresses: ipAddresses, - } + // Fast path: try subnet cache first + var parentNicMac string + if cached, ok := i.config.IaaSClient.GetCachedParentNicMac(subnet); ok { + logger.Debug("parentNicMac cache hit by subnet", zap.String("subnet", subnet)) + parentNicMac = cached + } else { + // Get parentNicMac: try subnet cache first, then pod-based lookup + if pod == nil { + pod, err = i.podManager.GetPodByName(ctx, endpoint.Namespace, endpoint.Name, true) + if err != nil { + logger.Error("failed to get pod for IaaS release", zap.Error(err)) + errs = append(errs, fmt.Errorf("failed to get pod %s/%s: %w", endpoint.Namespace, endpoint.Name, err)) + continue + } + } + parentNicMac, err = i.getParentNicMacFromMultus(ctx, pod, nic, subnet) + if err != nil { + logger.Warn("Failed to get parentNicMac for IaaS release, proceeding with empty value", + zap.String("nic", detail.NIC), + zap.String("subnet", subnet), + zap.Error(err)) + } + } - logger.Debug("Calling IaaS release API", - zap.String("podUID", endpoint.Status.Current.UID), - zap.String("nodeName", endpoint.Status.Current.Node), - zap.Strings("ipAddresses", ipAddresses), - ) + req := &iaasclient.ReleaseIPRequest{ + PodName: endpoint.Name, + PodNamespace: endpoint.Namespace, + PodUID: endpoint.Status.Current.UID, + NodeName: endpoint.Status.Current.Node, + IPAddress: ipStr, + Subnet: subnet, + ParentNicMac: parentNicMac, + } - if err := i.config.IaaSClient.ReleaseIPs(ctx, req); err != nil { - logger.Error("IaaS release API failed", + logger.Debug("Calling IaaS release API", zap.String("podUID", endpoint.Status.Current.UID), - zap.Strings("ipAddresses", ipAddresses), - zap.Error(err), + zap.String("nodeName", endpoint.Status.Current.Node), + zap.String("ipAddress", ipStr), + zap.String("subnet", subnet), + zap.String("parentNicMac", parentNicMac), ) - return fmt.Errorf("iaas release failed: %w", err) + + if err := i.config.IaaSClient.ReleaseIP(ctx, req); err != nil { + logger.Error("IaaS release API failed", + zap.String("podUID", endpoint.Status.Current.UID), + zap.String("ipAddress", ipStr), + zap.String("subnet", subnet), + zap.Error(err), + ) + errs = append(errs, fmt.Errorf("failed to release IP %s: %w", ipStr, err)) + continue + } + + logger.Info("IaaS release API succeeded", zap.String("ipAddress", ipStr)) } - logger.Info("IaaS release API succeeded", zap.Strings("ipAddresses", ipAddresses)) + if len(errs) > 0 { + return fmt.Errorf("iaas release failed for %d IP(s): %v", len(errs), errs) + } return nil } // getParentNicMacFromMultus gets the parent NIC MAC address by: -// 1. Checking the in-memory cache first (keyed by SpiderMultusConfig namespace/name) +// 1. Checking the in-memory cache first using subnet as key // 2. If not cached: parsing pod's Multus annotation to find the NAD for the given NIC -// 3. Reading SpiderMultusConfig (same name as NAD) to get the master interface -// 4. Using netlink to get the master interface MAC on the host -// 5. Storing the result in cache for future lookups -func (i *ipam) getParentNicMacFromMultus(ctx context.Context, pod *corev1.Pod, nic string) (string, error) { +// 3. Checking the cache using SpiderMultusConfig namespace/name as key +// 4. Reading SpiderMultusConfig to get the master interface and resolving its MAC via netlink +// 5. Storing the result in cache keyed by both subnet and SpiderMultusConfig namespace/name +func (i *ipam) getParentNicMacFromMultus(ctx context.Context, pod *corev1.Pod, nic string, subnet string) (string, error) { if i.config.APIReader == nil { return "", fmt.Errorf("APIReader is not configured") } + if subnet != "" { + if cached, ok := i.config.IaaSClient.GetCachedParentNicMac(subnet); ok { + return cached, nil + } + } + // Step 1: find the NAD info for this NIC from Multus annotations netInfo, err := iaasutils.GetMultusNetworkForNIC(pod, nic, i.config.AgentNamespace, i.config.MultusClusterNetwork) if err != nil { @@ -179,6 +225,9 @@ func (i *ipam) getParentNicMacFromMultus(ctx context.Context, pod *corev1.Pod, n // Step 2: check IaaS client cache using SpiderMultusConfig namespace/name as key cacheKey := netInfo.Namespace + "/" + netInfo.Name if cached, ok := i.config.IaaSClient.GetCachedParentNicMac(cacheKey); ok { + if subnet != "" { + i.config.IaaSClient.CacheParentNicMac(subnet, cached) + } return cached, nil } @@ -203,14 +252,17 @@ func (i *ipam) getParentNicMacFromMultus(ctx context.Context, pod *corev1.Pod, n mac := link.Attrs().HardwareAddr.String() // Step 6: store in IaaS client cache for future lookups + if subnet != "" { + i.config.IaaSClient.CacheParentNicMac(subnet, mac) + } i.config.IaaSClient.CacheParentNicMac(cacheKey, mac) return mac, nil } // prewarmParentNicMacCache lists all vlan-type SpiderMultusConfigs at startup -// and resolves their master interface MAC addresses into the cache. -// This ensures the cache is populated before any allocate/release calls. +// and resolves their master interface MAC addresses into the cache keyed by +// SpiderMultusConfig namespace/name only. func (i *ipam) prewarmParentNicMacCache(ctx context.Context) { logger := logutils.FromContext(ctx) logger.Info("Prewarming parentNicMac cache from SpiderMultusConfigs") @@ -239,7 +291,7 @@ func (i *ipam) prewarmParentNicMacCache(ctx context.Context) { } cacheKey := smc.Namespace + "/" + smc.Name - // Skip if already cached + // Skip if already cached by SMC key if _, ok := i.config.IaaSClient.GetCachedParentNicMac(cacheKey); ok { continue } @@ -265,63 +317,6 @@ func (i *ipam) prewarmParentNicMacCache(ctx context.Context) { logger.Info("Finished prewarming parentNicMac cache", zap.Int("count", count)) } -// parentNicMacFallbackLookup is a fallback function for the IaaS client to look up -// parentNicMac when the cache does not have the value (e.g., after agent restart). -// It lists all SpiderMultusConfigs with vlan CNI type, gets their master interface -// names, and resolves the MAC address via netlink. -func (i *ipam) parentNicMacFallbackLookup(ctx context.Context, _ string) (string, error) { - logger := logutils.FromContext(ctx) - logger.Info("parentNicMac fallback lookup") - - if i.config.APIReader == nil { - return "", fmt.Errorf("APIReader is not configured") - } - - // List all SpiderMultusConfigs - smcList := &v2beta1.SpiderMultusConfigList{} - if err := i.config.APIReader.List(ctx, smcList); err != nil { - return "", fmt.Errorf("failed to list SpiderMultusConfigs: %w", err) - } - - // Find vlan type SMCs and resolve their master interface MAC - for idx := range smcList.Items { - smc := &smcList.Items[idx] - if smc.Spec.CniType == nil || *smc.Spec.CniType != constant.VlanCNI { - continue - } - - masterIface, err := getMasterIfaceFromMultusConfig(smc) - if err != nil { - // Skip non-vlan or misconfigured SMCs - continue - } - - // Check cache first using SMC namespace/name as key - cacheKey := smc.Namespace + "/" + smc.Name - if cached, ok := i.config.IaaSClient.GetCachedParentNicMac(cacheKey); ok { - return cached, nil - } - - // Get MAC address of the master interface via netlink - link, err := netlink.LinkByName(masterIface) - if err != nil { - logger.Warn("failed to get link for master interface", - zap.String("masterIface", masterIface), - zap.String("smc", cacheKey), - zap.Error(err)) - continue - } - - mac := link.Attrs().HardwareAddr.String() - - // Cache using SMC namespace/name as key - i.config.IaaSClient.CacheParentNicMac(cacheKey, mac) - return mac, nil - } - - return "", fmt.Errorf("no vlan-type SpiderMultusConfig found for parentNicMac lookup") -} - // getMasterIfaceFromMultusConfig extracts the first master interface name from a SpiderMultusConfig func getMasterIfaceFromMultusConfig(smc *v2beta1.SpiderMultusConfig) (string, error) { if smc.Spec.CniType == nil { diff --git a/pkg/ipam/ipam.go b/pkg/ipam/ipam.go index 3fbe747a6..47a7b5d05 100644 --- a/pkg/ipam/ipam.go +++ b/pkg/ipam/ipam.go @@ -94,12 +94,6 @@ func NewIPAM( kubevirtManager: kubevirtManager, } - // Register parentNicMac fallback lookup on IaaS client so that - // release can recover from cache miss (e.g., after agent restart). - if config.IaaSClient != nil { - config.IaaSClient.SetParentNicMacLookupFunc(i.parentNicMacFallbackLookup) - } - return i, nil } diff --git a/pkg/ipam/release.go b/pkg/ipam/release.go index d44ab1bea..6afb415b5 100644 --- a/pkg/ipam/release.go +++ b/pkg/ipam/release.go @@ -151,12 +151,14 @@ func (i *ipam) releaseForAllNICs(ctx context.Context, uid, nic string, endpoint logger.Sugar().Infof("Release IP allocation details: %v", allocation.IPs) - if err := i.release(ctx, allocation.UID, allocation.IPs); err != nil { + // Call IaaS provider to release IPs first (before releasing from internal IPPools) + // to avoid IP conflicts where the IP could be re-allocated before IaaS release completes. + if err := i.callIaaSRelease(ctx, nic, endpoint); err != nil { return err } - // Call IaaS provider to release IPs after releasing from internal IPPools - if err := i.callIaaSRelease(ctx, endpoint); err != nil { + // Release from internal IPPools after IaaS release succeeds + if err := i.release(ctx, allocation.UID, allocation.IPs); err != nil { return err } diff --git a/pkg/networking/networking/ipam_detection.go b/pkg/networking/networking/ipam_detection.go index ae2a0f65e..25bb3b33d 100644 --- a/pkg/networking/networking/ipam_detection.go +++ b/pkg/networking/networking/ipam_detection.go @@ -67,11 +67,31 @@ func DetectIPConflictAndGatewayReachable(logger *zap.Logger, iface string, hostN return nil } + // When IPAM is invoked, the NIC is down and must be set it up in order to detect IP conflicts and + // gateway reachability. + err := netns.Do(func(netNS ns.NetNS) error { + l, err := netlink.LinkByName(iface) + if err != nil { + return fmt.Errorf("failed to get link: %w", err) + } + + if err = netlink.LinkSetUp(l); err != nil { + return fmt.Errorf("failed to set link up: %w", err) + } + + logger.Sugar().Debugf("Set link %s to up for IP conflict and gateway detection", iface) + return nil + }) + if err != nil { + return fmt.Errorf("failed to set link up: %w", err) + } + errg := errgroup.Group{} - err := netns.Do(func(_ ns.NetNS) error { + err = netns.Do(func(_ ns.NetNS) error { for _, ipa := range dectectIPs { if ipa.Version == nil { - return nil + logger.Debug("IP version is nil, skip detection") + continue } ipaddress, _, err := net.ParseCIDR(*ipa.Address) if err != nil { diff --git a/test/scripts/install-kdoctor.sh b/test/scripts/install-kdoctor.sh index 7fd90eed4..d4857b708 100644 --- a/test/scripts/install-kdoctor.sh +++ b/test/scripts/install-kdoctor.sh @@ -53,7 +53,8 @@ case ${E2E_IP_FAMILY} in esac KDOCTOR_HELM_OPTIONS+=" --set kdoctorAgent.image.registry=${E2E_KDOCTOR_IMAGE_REPO} \ - --set kdoctorController.image.registry=${E2E_KDOCTOR_IMAGE_REPO} --set global.imageTagOverride=latest " + --set kdoctorController.image.registry=${E2E_KDOCTOR_IMAGE_REPO} \ + --set global.imageTagOverride=latest " echo "KDOCTOR_HELM_OPTIONS: ${KDOCTOR_HELM_OPTIONS}"