Skip to content

Commit f608e7a

Browse files
authored
Merge pull request #1105 from mmerkes/release-1.30
1.30: Requires node topology labels to be set for known supported instance …
2 parents ceef890 + 4f26082 commit f608e7a

File tree

8 files changed

+180
-30
lines changed

8 files changed

+180
-30
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,4 @@ site/
1010
e2e.test
1111
.idea/
1212
**/*.swp
13+
.DS_Store

pkg/providers/v1/aws.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1064,7 +1064,7 @@ func newAWSCloud2(cfg config.CloudConfig, awsServices Services, provider config.
10641064
}
10651065
awsCloud.instanceCache.cloud = awsCloud
10661066
awsCloud.zoneCache.cloud = awsCloud
1067-
awsCloud.instanceTopologyManager = resourcemanagers.NewInstanceTopologyManager(ec2v2)
1067+
awsCloud.instanceTopologyManager = resourcemanagers.NewInstanceTopologyManager(ec2v2, &cfg)
10681068

10691069
tagged := cfg.Global.KubernetesClusterTag != "" || cfg.Global.KubernetesClusterID != ""
10701070
if cfg.Global.VPC != "" && (cfg.Global.SubnetID != "" || cfg.Global.RoleARN != "") && tagged {

pkg/providers/v1/config/config.go

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,10 @@ package config
22

33
import (
44
"fmt"
5-
"github.com/aws/aws-sdk-go/aws/request"
65
"strings"
76

7+
"github.com/aws/aws-sdk-go/aws/request"
8+
89
"github.com/aws/aws-sdk-go/aws/endpoints"
910

1011
"k8s.io/klog/v2"
@@ -62,6 +63,14 @@ type CloudConfig struct {
6263

6364
// NodeIPFamilies determines which IP addresses are added to node objects and their ordering.
6465
NodeIPFamilies []string
66+
67+
// Override to regex validating whether or not instance types require instance topology
68+
// to get a definitive response. This will impact whether or not the node controller will
69+
// block on getting instance topology information for nodes.
70+
// See pkg/resourcemanagers/topology.go for more details.
71+
//
72+
// WARNING: Updating the default behavior and corresponding unit tests would be a much safer option.
73+
SupportedTopologyInstanceTypePattern string `json:"supportedTopologyInstanceTypePattern,omitempty" yaml:"supportedTopologyInstanceTypePattern,omitempty"`
6574
}
6675
// [ServiceOverride "1"]
6776
// Service = s3

pkg/providers/v1/instances_v2.go

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -83,17 +83,25 @@ func (c *Cloud) getAdditionalLabels(ctx context.Context, zoneName string, instan
8383
// If topology labels are already set, skip.
8484
if _, ok := existingLabels[LabelNetworkNodePrefix+"1"]; !ok {
8585
nodeTopology, err := c.instanceTopologyManager.GetNodeTopology(ctx, instanceType, region, instanceID)
86-
// We've seen some edge cases where this functionality is problematic, so swallowing errors and logging
87-
// to avoid short-circuiting syncing nodes. If it's an intermittent issue, the labels will be added
88-
// on subsequent attempts.
86+
8987
if err != nil {
90-
klog.Warningf("Failed to get node topology. Moving on without setting labels: %q", err)
88+
if c.instanceTopologyManager.DoesInstanceTypeRequireResponse(instanceType) {
89+
klog.Errorf("Failed to get node topology for instance type %s and one is expected %v.", instanceType, err)
90+
return nil, err
91+
}
92+
93+
// We don't expect that there will be a response for these instance types anyway,
94+
// so we're going to move on without setting the labels.
95+
klog.Warningf("Failed to get node topology for instance type %s and ID %s. Moving on without setting labels. Ignoring %v",
96+
instanceType, instanceID, err)
9197
} else if nodeTopology != nil {
9298
for index, networkNode := range nodeTopology.NetworkNodes {
9399
layer := index + 1
94100
label := LabelNetworkNodePrefix + strconv.Itoa(layer)
95101
additionalLabels[label] = networkNode
96102
}
103+
} else {
104+
klog.Infof("No instance topolopy for instance type %s available.", instanceType)
97105
}
98106
}
99107

pkg/providers/v1/instances_v2_test.go

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -238,13 +238,14 @@ func TestInstanceMetadata(t *testing.T) {
238238
assert.Equal(t, map[string]string{}, result.AdditionalLabels)
239239
})
240240

241-
t.Run("Should swallow errors if getting node topology fails", func(t *testing.T) {
241+
t.Run("Should swallow errors if getting node topology fails if instance type not expected to be supported", func(t *testing.T) {
242242
instance := makeInstance("i-00000000000000000", "192.168.0.1", "1.2.3.4", "instance-same.ec2.internal", "instance-same.ec2.external", nil, true)
243243
c, _ := mockInstancesResp(&instance, []*ec2.Instance{&instance})
244244
var mockedTopologyManager resourcemanagers.MockedInstanceTopologyManager
245245
c.instanceTopologyManager = &mockedTopologyManager
246246
mockedTopologyManager.On("GetNodeTopology", mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(nil,
247247
services.NewMockAPIError("InvalidParameterValue", "Nope."))
248+
mockedTopologyManager.On("DoesInstanceTypeRequireResponse", mock.Anything).Return(false)
248249
node := &v1.Node{
249250
Spec: v1.NodeSpec{
250251
ProviderID: fmt.Sprintf("aws:///us-west-2c/1abc-2def/%s", *instance.InstanceId),
@@ -261,6 +262,28 @@ func TestInstanceMetadata(t *testing.T) {
261262
LabelZoneID: "az1",
262263
}, result.AdditionalLabels)
263264
})
265+
266+
t.Run("Should not swallow errors if getting node topology fails if instance type is expected to be supported", func(t *testing.T) {
267+
instance := makeInstance("i-00000000000000000", "192.168.0.1", "1.2.3.4", "instance-same.ec2.internal", "instance-same.ec2.external", nil, true)
268+
c, _ := mockInstancesResp(&instance, []*ec2.Instance{&instance})
269+
var mockedTopologyManager resourcemanagers.MockedInstanceTopologyManager
270+
c.instanceTopologyManager = &mockedTopologyManager
271+
mockedTopologyManager.On("GetNodeTopology", mock.Anything, mock.Anything, mock.Anything, mock.Anything).Return(nil,
272+
services.NewMockAPIError("InvalidParameterValue", "Nope."))
273+
mockedTopologyManager.On("DoesInstanceTypeRequireResponse", mock.Anything).Return(true)
274+
node := &v1.Node{
275+
Spec: v1.NodeSpec{
276+
ProviderID: fmt.Sprintf("aws:///us-west-2c/1abc-2def/%s", *instance.InstanceId),
277+
},
278+
}
279+
280+
_, err := c.InstanceMetadata(context.TODO(), node)
281+
if err == nil {
282+
t.Error("Should error getting InstanceMetadata but succeeded.")
283+
}
284+
285+
mockedTopologyManager.AssertNumberOfCalls(t, "GetNodeTopology", 1)
286+
})
264287
}
265288

266289
func getCloudWithMockedDescribeInstances(instanceExists bool, instanceState string) *Cloud {

pkg/resourcemanagers/topology.go

Lines changed: 54 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -20,18 +20,33 @@ import (
2020
"context"
2121
"errors"
2222
"fmt"
23+
"regexp"
2324
"time"
2425

2526
"github.com/aws/aws-sdk-go-v2/service/ec2"
2627
"github.com/aws/aws-sdk-go-v2/service/ec2/types"
2728
"github.com/aws/smithy-go"
2829
"k8s.io/client-go/tools/cache"
30+
"k8s.io/cloud-provider-aws/pkg/providers/v1/config"
2931
"k8s.io/cloud-provider-aws/pkg/services"
3032
"k8s.io/klog/v2"
3133
)
3234

3335
const instanceTopologyManagerCacheTimeout = 24 * time.Hour
3436

37+
/*
38+
We need to ensure that instance types that we expect a response will not successfully complete syncing unless
39+
we get a response, so we can track known instance types that we expect to get a response for.
40+
41+
Supported instance types for DescribeInstanceTopology as of 2/6/25 from API documentation:
42+
https://docs.aws.amazon.com/AWSEC2/latest/APIReference/API_DescribeInstanceTopology.html
43+
44+
hpc6a.48xlarge | hpc6id.32xlarge | hpc7a.12xlarge | hpc7a.24xlarge | hpc7a.48xlarge | hpc7a.96xlarge | hpc7g.4xlarge | hpc7g.8xlarge | hpc7g.16xlarge
45+
p3dn.24xlarge | p4d.24xlarge | p4de.24xlarge | p5.48xlarge | p5e.48xlarge | p5en.48xlarge
46+
trn1.2xlarge | trn1.32xlarge | trn1n.32xlarge | trn2.48xlarge | trn2u.48xlarge
47+
*/
48+
var defaultSupportedTopologyInstanceTypePattern = regexp.MustCompile(`^(hpc|trn|p|inf)[0-9]+[a-z]*(\.[0-9a-z]*)$`)
49+
3550
// stringKeyFunc is a string as cache key function
3651
func topStringKeyFunc(obj interface{}) (string, error) {
3752
// Type should already be a string, so just return as is.
@@ -46,26 +61,36 @@ func topStringKeyFunc(obj interface{}) (string, error) {
4661
// InstanceTopologyManager enables mocking the InstanceTopologyManager.
4762
type InstanceTopologyManager interface {
4863
GetNodeTopology(ctx context.Context, instanceType string, region string, instanceID string) (*types.InstanceTopology, error)
64+
DoesInstanceTypeRequireResponse(instanceType string) bool
4965
}
5066

5167
// instanceTopologyManager manages getting instance topology for nodes.
5268
type instanceTopologyManager struct {
53-
ec2 services.Ec2SdkV2
54-
unsupportedKeyStore cache.Store
69+
ec2 services.Ec2SdkV2
70+
unsupportedKeyStore cache.Store
71+
supportedTopologyInstanceTypePattern *regexp.Regexp
5572
}
5673

5774
// NewInstanceTopologyManager generates a new InstanceTopologyManager.
58-
func NewInstanceTopologyManager(ec2 services.Ec2SdkV2) InstanceTopologyManager {
75+
func NewInstanceTopologyManager(ec2 services.Ec2SdkV2, cfg *config.CloudConfig) InstanceTopologyManager {
76+
var supportedTopologyInstanceTypePattern *regexp.Regexp
77+
if cfg.Global.SupportedTopologyInstanceTypePattern != "" {
78+
supportedTopologyInstanceTypePattern = regexp.MustCompile(cfg.Global.SupportedTopologyInstanceTypePattern)
79+
} else {
80+
supportedTopologyInstanceTypePattern = defaultSupportedTopologyInstanceTypePattern
81+
}
82+
5983
return &instanceTopologyManager{
60-
ec2: ec2,
84+
ec2: ec2,
85+
supportedTopologyInstanceTypePattern: supportedTopologyInstanceTypePattern,
6186
// These should change very infrequently, if ever, so checking once a day sounds fair.
6287
unsupportedKeyStore: cache.NewTTLStore(topStringKeyFunc, instanceTopologyManagerCacheTimeout),
6388
}
6489
}
6590

6691
// GetNodeTopology gets the instance topology for a node.
6792
func (t *instanceTopologyManager) GetNodeTopology(ctx context.Context, instanceType string, region string, instanceID string) (*types.InstanceTopology, error) {
68-
if t.mightSupportTopology(instanceType, region) {
93+
if t.mightSupportTopology(instanceID, instanceType, region) {
6994
request := &ec2.DescribeInstanceTopologyInput{InstanceIds: []string{instanceID}}
7095
topologies, err := t.ec2.DescribeInstanceTopology(ctx, request)
7196
if err != nil {
@@ -85,19 +110,26 @@ func (t *instanceTopologyManager) GetNodeTopology(ctx context.Context, instanceT
85110
t.addUnsupported(region)
86111
return nil, nil
87112
case "RequestLimitExceeded":
88-
// Gracefully handle request throttling
89113
klog.Warningf("Exceeded ec2:DescribeInstanceTopology request limits. Try again later: %q", err)
90-
return nil, nil
114+
return nil, err
91115
}
92116
}
93117

94118
// Unhandled error
95119
klog.Errorf("Error describing instance topology: %q", err)
96120
return nil, err
97121
} else if len(topologies) == 0 {
98-
// If no topology is returned, track the instance type as unsupported
99-
klog.Infof("Instance type %s unsupported for getting instance topology", instanceType)
100-
t.addUnsupported(instanceType)
122+
// If no topology is returned, track the instance type as unsupported if we don't require a response.
123+
if t.DoesInstanceTypeRequireResponse(instanceType) {
124+
// While the instance type could be unsupported, it's also possible that the instance is deleting or shut down
125+
// and has no active instance topology. In this case, we don't want to track it as unsupported.
126+
klog.Warningf("Instance %s of type %s has no instance topology listed but may be a supported type.", instanceID, instanceType)
127+
// Track that the instance ID is does not include a response. This will prevent us from calling again unnecessarily.
128+
t.addUnsupported(instanceID)
129+
} else {
130+
klog.Infof("Instance type %s unsupported for getting instance topology", instanceType)
131+
t.addUnsupported(instanceType)
132+
}
101133
return nil, nil
102134
}
103135

@@ -106,14 +138,19 @@ func (t *instanceTopologyManager) GetNodeTopology(ctx context.Context, instanceT
106138
return nil, nil
107139
}
108140

141+
// DoesInstanceTypeRequireResponse verifies whether or not we expect an instance to have an instance topology response.
142+
func (t *instanceTopologyManager) DoesInstanceTypeRequireResponse(instanceType string) bool {
143+
return t.supportedTopologyInstanceTypePattern.MatchString(instanceType)
144+
}
145+
109146
func (t *instanceTopologyManager) addUnsupported(key string) {
110147
err := t.unsupportedKeyStore.Add(key)
111148
if err != nil {
112149
klog.Errorf("Failed to cache unsupported key %s: %q", key, err)
113150
}
114151
}
115152

116-
func (t *instanceTopologyManager) mightSupportTopology(instanceType string, region string) bool {
153+
func (t *instanceTopologyManager) mightSupportTopology(instanceID string, instanceType string, region string) bool {
117154
// In the case of fargate and possibly other variants, the instance type will be empty.
118155
if len(instanceType) == 0 {
119156
return false
@@ -125,6 +162,12 @@ func (t *instanceTopologyManager) mightSupportTopology(instanceType string, regi
125162
klog.Errorf("Failed to get cached unsupported region: %q:", err)
126163
}
127164

165+
if _, exists, err := t.unsupportedKeyStore.GetByKey(instanceID); exists {
166+
return false
167+
} else if err != nil {
168+
klog.Errorf("Failed to get cached unsupported instance ID: %q:", err)
169+
}
170+
128171
if _, exists, err := t.unsupportedKeyStore.GetByKey(instanceType); exists {
129172
return false
130173
} else if err != nil {

pkg/resourcemanagers/topology_mock.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,3 +37,9 @@ func (m *MockedInstanceTopologyManager) GetNodeTopology(ctx context.Context, ins
3737
}
3838
return args.Get(0).(*types.InstanceTopology), nil
3939
}
40+
41+
// DoesInstanceTypeRequireResponse mocks InstanceTopologyManager.DoesInstanceTypeRequireResponse.
42+
func (m *MockedInstanceTopologyManager) DoesInstanceTypeRequireResponse(instanceType string) bool {
43+
args := m.Called(instanceType)
44+
return args.Get(0).(bool)
45+
}

0 commit comments

Comments
 (0)