Skip to content

Commit 6cc3a9b

Browse files
committed
updated based on comments
1 parent 9529609 commit 6cc3a9b

10 files changed

Lines changed: 1524 additions & 188 deletions

coverage.out

Lines changed: 849 additions & 0 deletions
Large diffs are not rendered by default.

pkg/awsutils/awsutils.go

Lines changed: 66 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -237,7 +237,7 @@ type APIs interface {
237237
GetVpcSubnets(ctx context.Context) ([]ec2types.Subnet, error)
238238

239239
// IsSubnetExcluded returns if a subnet is excluded for pod IPs based on its tags
240-
IsSubnetExcluded(ctx context.Context, subnetID string) (bool, error)
240+
IsSubnetExcluded(ctx context.Context, subnetID string, isPrimarySubnet bool) (bool, error)
241241
}
242242

243243
// EC2InstanceMetadataCache caches instance metadata
@@ -299,6 +299,9 @@ type ENIMetadata struct {
299299

300300
// Network card the ENI is attached on
301301
NetworkCard int
302+
303+
// SubnetID the ENI is created from
304+
SubnetID string
302305
}
303306

304307
// PrimaryIPv4Address returns the primary IPv4 address of this node
@@ -557,7 +560,12 @@ func (cache *EC2InstanceMetadataCache) discoverCustomSecurityGroups(ctx context.
557560
},
558561
}
559562

560-
result, err := cache.ec2SVC.DescribeSecurityGroups(ctx, describeSGInput)
563+
var result *ec2.DescribeSecurityGroupsOutput
564+
err := retry.NWithBackoffCtx(ctx, retry.NewSimpleBackoff(time.Millisecond*100, time.Second*5, 0.15, 2.0), 5, func() error {
565+
var err error
566+
result, err = cache.ec2SVC.DescribeSecurityGroups(ctx, describeSGInput)
567+
return err
568+
})
561569
if err != nil {
562570
return nil, fmt.Errorf("discoverCustomSecurityGroups: unable to describe security groups: %v", err)
563571
}
@@ -588,14 +596,8 @@ func (cache *EC2InstanceMetadataCache) GetENISubnetID(ctx context.Context, eniID
588596
return *result.NetworkInterfaces[0].SubnetId, nil
589597
}
590598

591-
// Helper function to check if an ENI is in a secondary subnet
592-
func (cache *EC2InstanceMetadataCache) isENIInSecondarySubnet(ctx context.Context, eniID string) bool {
593-
eniSubnetID, err := cache.GetENISubnetID(ctx, eniID)
594-
return err == nil && eniSubnetID != cache.subnetID
595-
}
596-
597599
// Helper function to get ENIs that match specific criteria
598-
func (cache *EC2InstanceMetadataCache) getFilteredENIs(ctx context.Context, store *datastore.DataStore, onlySecondarySubnets bool) []string {
600+
func (cache *EC2InstanceMetadataCache) getFilteredENIs(store *datastore.DataStore, onlySecondarySubnets bool) []string {
599601
eniInfos := store.GetENIInfos()
600602
var eniIDs []string
601603

@@ -606,7 +608,7 @@ func (cache *EC2InstanceMetadataCache) getFilteredENIs(ctx context.Context, stor
606608
continue
607609
}
608610

609-
isSecondarySubnet := cache.isENIInSecondarySubnet(ctx, eniID)
611+
isSecondarySubnet := eniInfo.SubnetID != cache.subnetID
610612

611613
// Filter based on subnet type
612614
if onlySecondarySubnets != isSecondarySubnet {
@@ -681,12 +683,10 @@ func (cache *EC2InstanceMetadataCache) RefreshCustomSGIDs(ctx context.Context, d
681683
if err != nil {
682684
awsAPIErrInc("DiscoverCustomSecurityGroups", err)
683685
log.Warnf("Failed to discover custom security groups: %v. Falling back to using primary security groups for ENIs in secondary subnets", err)
684-
685-
// Clear custom security groups cache to trigger fallback behavior
686-
cache.customSecurityGroups.Set([]string{})
687-
688-
// Apply primary security groups to ENIs in secondary subnets as fallback
689-
cache.applyFallbackSecurityGroupsForAllDatastores(ctx, dsAccess)
686+
if eventRecorder := eventrecorder.Get(); eventRecorder != nil {
687+
eventRecorder.SendPodEvent(v1.EventTypeWarning, "FailedCustomSecurityGroupsDiscovery", "DescribeSecurityGroups",
688+
"aws-node failed calling ec2 api to discover custmized security groups for network interfaces from secondary subnets")
689+
}
690690
return err
691691
}
692692

@@ -698,7 +698,7 @@ func (cache *EC2InstanceMetadataCache) RefreshCustomSGIDs(ctx context.Context, d
698698
cache.customSecurityGroups.Set([]string{})
699699

700700
// Apply primary security groups to ENIs in secondary subnets as fallback
701-
cache.applyFallbackSecurityGroupsForAllDatastores(ctx, dsAccess)
701+
cache.applyPrimarySGsToSecondarySubnetENIs(ctx, dsAccess)
702702

703703
return nil
704704
}
@@ -710,16 +710,16 @@ func (cache *EC2InstanceMetadataCache) RefreshCustomSGIDs(ctx context.Context, d
710710
if addedCount != 0 || deletedCount != 0 {
711711
var eniIDs []string
712712
for _, ds := range dsAccess.DataStores {
713-
eniIDs = append(eniIDs, cache.getFilteredENIs(ctx, ds, true)...) // only secondary subnet ENIs
713+
eniIDs = append(eniIDs, cache.getFilteredENIs(ds, true)...) // only secondary subnet ENIs
714714
}
715715
cache.applySecurityGroupsToENIs(ctx, eniIDs, sgIDs, "Update")
716716
}
717717

718718
return nil
719719
}
720720

721-
// applyFallbackSecurityGroupsForAllDatastores applies primary security groups to ENIs in secondary subnets across all datastores
722-
func (cache *EC2InstanceMetadataCache) applyFallbackSecurityGroupsForAllDatastores(ctx context.Context, dsAccess *datastore.DataStoreAccess) {
721+
// applyPrimarySGsToSecondarySubnetENIs applies primary security groups to ENIs in secondary subnets across all datastores
722+
func (cache *EC2InstanceMetadataCache) applyPrimarySGsToSecondarySubnetENIs(ctx context.Context, dsAccess *datastore.DataStoreAccess) {
723723
log.Info("Applying primary security groups as fallback for ENIs in secondary subnets across all datastores")
724724

725725
primarySGs := cache.securityGroups.SortedList()
@@ -729,7 +729,7 @@ func (cache *EC2InstanceMetadataCache) applyFallbackSecurityGroupsForAllDatastor
729729

730730
var eniIDs []string
731731
for _, ds := range dsAccess.DataStores {
732-
eniIDs = append(eniIDs, cache.getFilteredENIs(ctx, ds, true)...) // only secondary subnet ENIs
732+
eniIDs = append(eniIDs, cache.getFilteredENIs(ds, true)...) // only secondary subnet ENIs
733733
}
734734

735735
cache.applySecurityGroupsToENIs(ctx, eniIDs, primarySGs, "Applying primary security groups to")
@@ -753,7 +753,7 @@ func (cache *EC2InstanceMetadataCache) RefreshSGIDs(ctx context.Context, mac str
753753
if cache.useSubnetDiscovery {
754754
for _, ds := range dsAccess.DataStores {
755755
// Get only primary subnet ENIs (onlySecondarySubnets=false)
756-
primarySubnetENIs := cache.getFilteredENIs(ctx, ds, false)
756+
primarySubnetENIs := cache.getFilteredENIs(ds, false)
757757
for _, eniID := range primarySubnetENIs {
758758
// Filter out unmanaged ENIs
759759
if !cache.unmanagedENIs.Has(eniID) {
@@ -878,6 +878,12 @@ func (cache *EC2InstanceMetadataCache) getENIMetadata(eniMAC string) (ENIMetadat
878878
}
879879
}
880880

881+
subnetID, err := cache.imds.GetSubnetID(ctx, eniMAC)
882+
if err != nil {
883+
awsAPIErrInc("GetSubnetID", err)
884+
return ENIMetadata{}, err
885+
}
886+
881887
if !ipv4Available && !ipv6Available {
882888
return ENIMetadata{
883889
ENIID: eniID,
@@ -890,6 +896,7 @@ func (cache *EC2InstanceMetadataCache) getENIMetadata(eniMAC string) (ENIMetadat
890896
IPv6Addresses: make([]ec2types.NetworkInterfaceIpv6Address, 0),
891897
IPv6Prefixes: make([]ec2types.Ipv6PrefixSpecification, 0),
892898
NetworkCard: networkCard,
899+
SubnetID: subnetID,
893900
}, nil
894901
}
895902

@@ -993,6 +1000,7 @@ func (cache *EC2InstanceMetadataCache) getENIMetadata(eniMAC string) (ENIMetadat
9931000
IPv6Addresses: ec2ip6s,
9941001
IPv6Prefixes: ec2ipv6Prefixes,
9951002
NetworkCard: networkCard,
1003+
SubnetID: subnetID,
9961004
}, nil
9971005
}
9981006

@@ -1198,9 +1206,9 @@ func (cache *EC2InstanceMetadataCache) createENI(ctx context.Context, sg []*stri
11981206
log.Info("Defaulting to same subnet as the primary interface for the new ENI")
11991207

12001208
// Even in fallback, check if primary subnet is excluded
1201-
excluded, checkErr := cache.IsSubnetExcluded(ctx, cache.subnetID)
1209+
excluded, checkErr := cache.IsSubnetExcluded(ctx, cache.subnetID, true)
12021210
if checkErr != nil {
1203-
log.Warnf("Failed to check if primary subnet is excluded: %v. Proceeding with ENI creation attempt.", checkErr)
1211+
return "", fmt.Errorf("Failed to check if primary subnet is excluded: %w. Quit ENI creation attempt.", checkErr)
12041212
} else if excluded {
12051213
// Primary subnet is explicitly excluded
12061214
return "", fmt.Errorf("primary subnet is tagged with kubernetes.io/role/cni=0 - no valid subnets available for ENI creation")
@@ -1215,18 +1223,21 @@ func (cache *EC2InstanceMetadataCache) createENI(ctx context.Context, sg []*stri
12151223
for _, subnet := range subnetResult {
12161224
// Check tag for all subnets including primary
12171225
isPrimarySubnet := *subnet.SubnetId == cache.subnetID
1218-
if !validTag(subnet, isPrimarySubnet) {
1226+
if !isSubnetValidForENICreation(subnet, isPrimarySubnet) {
12191227
// Log when primary subnet is excluded
12201228
if isPrimarySubnet {
12211229
log.Infof("Primary subnet %s is excluded from ENI creation", cache.subnetID)
12221230
}
12231231
continue
12241232
}
12251233
validSubnetsFound = true
1234+
// preset security groups for ENI with primary SGs
1235+
input.Groups = cache.securityGroups.SortedList()
12261236
// If this is a secondary subnet and we have custom security groups, use those instead
12271237
// We already determined isPrimarySubnet above, just reuse the variable
12281238
if !isPrimarySubnet && len(cache.customSecurityGroups.SortedList()) > 0 {
12291239
log.Infof("Using custom security groups for ENI in secondary subnet %s", *subnet.SubnetId)
1240+
// overring SGs if using secondary subnets and sgs
12301241
input.Groups = cache.customSecurityGroups.SortedList()
12311242
} else if !isPrimarySubnet {
12321243
// Secondary subnet but no custom security groups available - use primary SGs as fallback
@@ -1249,7 +1260,7 @@ func (cache *EC2InstanceMetadataCache) createENI(ctx context.Context, sg []*stri
12491260
} else {
12501261
log.Info("Using same security group config as the primary interface for the new ENI")
12511262
// When subnet discovery is disabled, check if primary subnet is excluded
1252-
excluded, checkErr := cache.IsSubnetExcluded(ctx, cache.subnetID)
1263+
excluded, checkErr := cache.IsSubnetExcluded(ctx, cache.subnetID, true)
12531264
if checkErr != nil {
12541265
// If we can't determine exclusion status, log warning and proceed
12551266
log.Warnf("Failed to check if primary subnet is excluded: %v. Proceeding with ENI creation attempt.", checkErr)
@@ -1300,18 +1311,13 @@ func (cache *EC2InstanceMetadataCache) GetVpcSubnets(ctx context.Context) ([]ec2
13001311
return subnetResult.Subnets, nil
13011312
}
13021313

1303-
// validTag checks if subnet should be used for ENI/IP allocation
1314+
// isSubnetValidForENICreation checks if subnet should be used for ENI/IP allocation
13041315
// For primary subnet: include by default (no tag), exclude only if tag value is "0"
13051316
// For secondary subnets: exclude by default (no tag), include only if tag exists with non-"0" value
13061317
// If the subnet has cluster-specific tags, it will only be used by the matching cluster
1307-
func validTag(subnet ec2types.Subnet, isPrimarySubnet bool) bool {
1308-
// Get cluster name for cluster-specific tag checks
1309-
localClusterName := os.Getenv(clusterNameEnvVar)
1310-
localClusterTagKey := clusterTagKeyPrefix + localClusterName
1311-
1318+
func isSubnetValidForENICreation(subnet ec2types.Subnet, isPrimarySubnet bool) bool {
13121319
// Parse subnet tags
13131320
cniTagValue := getTagValue(subnet.Tags, subnetDiscoveryTagKey)
1314-
hasClusterTags, belongsToThisCluster := checkClusterTags(subnet.Tags, localClusterTagKey)
13151321

13161322
// Rule 1: CNI tag with value "0" always excludes the subnet
13171323
if cniTagValue == subnetDiscoveryTagValueExcluded {
@@ -1333,7 +1339,7 @@ func validTag(subnet ec2types.Subnet, isPrimarySubnet bool) bool {
13331339
}
13341340

13351341
// Rule 3: Check cluster-specific tags
1336-
if !hasClusterTags || belongsToThisCluster {
1342+
if ValidSubnetForCluster(subnet, isPrimarySubnet) {
13371343
return true
13381344
}
13391345

@@ -1342,6 +1348,24 @@ func validTag(subnet ec2types.Subnet, isPrimarySubnet bool) bool {
13421348
return false
13431349
}
13441350

1351+
// ValidSubnetForCluster checks if a subnet is valid for use by this cluster
1352+
// For primary subnets, they are always valid for any cluster
1353+
// For secondary subnets, they must either have no cluster tags or have a matching cluster tag
1354+
func ValidSubnetForCluster(subnet ec2types.Subnet, isPrimarySubnet bool) bool {
1355+
if isPrimarySubnet {
1356+
// Primary subnets are always valid for any cluster
1357+
return true
1358+
}
1359+
// Get cluster name for cluster-specific tag checks
1360+
localClusterName := os.Getenv(clusterNameEnvVar)
1361+
localClusterTagKey := clusterTagKeyPrefix + localClusterName
1362+
hasClusterTags, belongsToThisCluster := checkClusterTags(subnet.Tags, localClusterTagKey)
1363+
if !hasClusterTags || belongsToThisCluster {
1364+
return true
1365+
}
1366+
return false
1367+
}
1368+
13451369
// getTagValue returns the value of a specific tag key, or empty string if not found
13461370
func getTagValue(tags []ec2types.Tag, key string) string {
13471371
for _, tag := range tags {
@@ -2603,7 +2627,7 @@ func checkAPIErrorAndBroadcastEvent(err error, api string) {
26032627
}
26042628

26052629
// IsSubnetExcluded checks if a subnet is excluded by examining its kubernetes.io/role/cni tag
2606-
func (cache *EC2InstanceMetadataCache) IsSubnetExcluded(ctx context.Context, subnetID string) (bool, error) {
2630+
func (cache *EC2InstanceMetadataCache) IsSubnetExcluded(ctx context.Context, subnetID string, isPrimarySubnet bool) (bool, error) {
26072631
// Get all VPC subnets with their tags
26082632
subnets, err := cache.GetVpcSubnets(ctx)
26092633
if err != nil {
@@ -2613,13 +2637,17 @@ func (cache *EC2InstanceMetadataCache) IsSubnetExcluded(ctx context.Context, sub
26132637
// Find the specific subnet and check its tags
26142638
for _, subnet := range subnets {
26152639
if *subnet.SubnetId == subnetID {
2640+
if !ValidSubnetForCluster(subnet, isPrimarySubnet) {
2641+
log.Debugf("IsSubnetExcluded: subnet %s doesn't have valid cluster tag", subnetID)
2642+
return true, nil
2643+
}
26162644
// Check if the subnet has the exclusion tag kubernetes.io/role/cni=0
26172645
for _, tag := range subnet.Tags {
26182646
if *tag.Key == "kubernetes.io/role/cni" {
26192647
tagValue := *tag.Value
2620-
excluded := tagValue == "0"
2621-
log.Debugf("IsSubnetExcluded: subnet %s has tag kubernetes.io/role/cni=%s, excluded=%t", subnetID, tagValue, excluded)
2622-
return excluded, nil
2648+
tagExcluded := tagValue == "0"
2649+
log.Debugf("IsSubnetExcluded: subnet %s has tag kubernetes.io/role/cni=%s, excluded=%t", subnetID, tagValue, tagExcluded)
2650+
return tagExcluded, nil
26232651
}
26242652
}
26252653

0 commit comments

Comments
 (0)