@@ -20,18 +20,33 @@ import (
2020 "context"
2121 "errors"
2222 "fmt"
23+ "regexp"
2324 "time"
2425
2526 "github.com/aws/aws-sdk-go-v2/service/ec2"
2627 "github.com/aws/aws-sdk-go-v2/service/ec2/types"
2728 "github.com/aws/smithy-go"
2829 "k8s.io/client-go/tools/cache"
30+ "k8s.io/cloud-provider-aws/pkg/providers/v1/config"
2931 "k8s.io/cloud-provider-aws/pkg/services"
3032 "k8s.io/klog/v2"
3133)
3234
// instanceTopologyManagerCacheTimeout is the TTL handed to the unsupported-key
// cache: a key recorded as unsupported is skipped until its entry expires.
const instanceTopologyManagerCacheTimeout = 24 * time.Hour
3436
/*
Instance types for which we expect a DescribeInstanceTopology response must not
complete syncing successfully until we actually receive one, so we track the
instance types that are known to return a response.

Supported instance types for DescribeInstanceTopology as of 2/6/25 from API documentation:
https://docs.aws.amazon.com/AWSEC2/latest/APIReference/API_DescribeInstanceTopology.html

hpc6a.48xlarge | hpc6id.32xlarge | hpc7a.12xlarge | hpc7a.24xlarge | hpc7a.48xlarge | hpc7a.96xlarge | hpc7g.4xlarge | hpc7g.8xlarge | hpc7g.16xlarge
p3dn.24xlarge | p4d.24xlarge | p4de.24xlarge | p5.48xlarge | p5e.48xlarge | p5en.48xlarge
trn1.2xlarge | trn1.32xlarge | trn1n.32xlarge | trn2.48xlarge | trn2u.48xlarge

The pattern is deliberately broader than the list above: it accepts any size of
the hpc, trn, p and inf families. It also accepts an optional hyphenated family
suffix (for example p6-b200.48xlarge) and requires a non-empty size after the
dot, so a malformed value such as "p5." is not treated as a supported type.
*/
var defaultSupportedTopologyInstanceTypePattern = regexp.MustCompile(`^(hpc|trn|p|inf)[0-9]+[a-z]*(-[a-z0-9]+)?\.[0-9a-z]+$`)
49+
3550// stringKeyFunc is a string as cache key function
3651func topStringKeyFunc (obj interface {}) (string , error ) {
3752 // Type should already be a string, so just return as is.
@@ -46,26 +61,36 @@ func topStringKeyFunc(obj interface{}) (string, error) {
4661// InstanceTopologyManager enables mocking the InstanceTopologyManager.
4762type InstanceTopologyManager interface {
4863 GetNodeTopology (ctx context.Context , instanceType string , region string , instanceID string ) (* types.InstanceTopology , error )
64+ DoesInstanceTypeRequireResponse (instanceType string ) bool
4965}
5066
// instanceTopologyManager manages getting instance topology for nodes.
type instanceTopologyManager struct {
	// ec2 is the EC2 client used to issue DescribeInstanceTopology requests.
	ec2 services.Ec2SdkV2
	// unsupportedKeyStore caches keys for which topology lookups are skipped.
	// Regions, instance types and instance IDs are all stored in this one
	// store as plain string keys.
	unsupportedKeyStore cache.Store
	// supportedTopologyInstanceTypePattern matches the instance types that are
	// expected to return an instance topology response.
	supportedTopologyInstanceTypePattern *regexp.Regexp
}
5673
5774// NewInstanceTopologyManager generates a new InstanceTopologyManager.
58- func NewInstanceTopologyManager (ec2 services.Ec2SdkV2 ) InstanceTopologyManager {
75+ func NewInstanceTopologyManager (ec2 services.Ec2SdkV2 , cfg * config.CloudConfig ) InstanceTopologyManager {
76+ var supportedTopologyInstanceTypePattern * regexp.Regexp
77+ if cfg .Global .SupportedTopologyInstanceTypePattern != "" {
78+ supportedTopologyInstanceTypePattern = regexp .MustCompile (cfg .Global .SupportedTopologyInstanceTypePattern )
79+ } else {
80+ supportedTopologyInstanceTypePattern = defaultSupportedTopologyInstanceTypePattern
81+ }
82+
5983 return & instanceTopologyManager {
60- ec2 : ec2 ,
84+ ec2 : ec2 ,
85+ supportedTopologyInstanceTypePattern : supportedTopologyInstanceTypePattern ,
6186 // These should change very infrequently, if ever, so checking once a day sounds fair.
6287 unsupportedKeyStore : cache .NewTTLStore (topStringKeyFunc , instanceTopologyManagerCacheTimeout ),
6388 }
6489}
6590
6691// GetNodeTopology gets the instance topology for a node.
6792func (t * instanceTopologyManager ) GetNodeTopology (ctx context.Context , instanceType string , region string , instanceID string ) (* types.InstanceTopology , error ) {
68- if t .mightSupportTopology (instanceType , region ) {
93+ if t .mightSupportTopology (instanceID , instanceType , region ) {
6994 request := & ec2.DescribeInstanceTopologyInput {InstanceIds : []string {instanceID }}
7095 topologies , err := t .ec2 .DescribeInstanceTopology (ctx , request )
7196 if err != nil {
@@ -85,19 +110,26 @@ func (t *instanceTopologyManager) GetNodeTopology(ctx context.Context, instanceT
85110 t .addUnsupported (region )
86111 return nil , nil
87112 case "RequestLimitExceeded" :
88- // Gracefully handle request throttling
89113 klog .Warningf ("Exceeded ec2:DescribeInstanceTopology request limits. Try again later: %q" , err )
90- return nil , nil
114+ return nil , err
91115 }
92116 }
93117
94118 // Unhandled error
95119 klog .Errorf ("Error describing instance topology: %q" , err )
96120 return nil , err
97121 } else if len (topologies ) == 0 {
98- // If no topology is returned, track the instance type as unsupported
99- klog .Infof ("Instance type %s unsupported for getting instance topology" , instanceType )
100- t .addUnsupported (instanceType )
122+ // If no topology is returned, track the instance type as unsupported if we don't require a response.
123+ if t .DoesInstanceTypeRequireResponse (instanceType ) {
124+ // While the instance type could be unsupported, it's also possible that the instance is deleting or shut down
125+ // and has no active instance topology. In this case, we don't want to track it as unsupported.
126+ klog .Warningf ("Instance %s of type %s has no instance topology listed but may be a supported type." , instanceID , instanceType )
127+ // Track that this instance ID returned no topology response. This will prevent us from calling again unnecessarily.
128+ t .addUnsupported (instanceID )
129+ } else {
130+ klog .Infof ("Instance type %s unsupported for getting instance topology" , instanceType )
131+ t .addUnsupported (instanceType )
132+ }
101133 return nil , nil
102134 }
103135
@@ -106,14 +138,19 @@ func (t *instanceTopologyManager) GetNodeTopology(ctx context.Context, instanceT
106138 return nil , nil
107139}
108140
141+ // DoesInstanceTypeRequireResponse verifies whether or not we expect an instance to have an instance topology response.
142+ func (t * instanceTopologyManager ) DoesInstanceTypeRequireResponse (instanceType string ) bool {
143+ return t .supportedTopologyInstanceTypePattern .MatchString (instanceType )
144+ }
145+
109146func (t * instanceTopologyManager ) addUnsupported (key string ) {
110147 err := t .unsupportedKeyStore .Add (key )
111148 if err != nil {
112149 klog .Errorf ("Failed to cache unsupported key %s: %q" , key , err )
113150 }
114151}
115152
116- func (t * instanceTopologyManager ) mightSupportTopology (instanceType string , region string ) bool {
153+ func (t * instanceTopologyManager ) mightSupportTopology (instanceID string , instanceType string , region string ) bool {
117154 // In the case of fargate and possibly other variants, the instance type will be empty.
118155 if len (instanceType ) == 0 {
119156 return false
@@ -125,6 +162,12 @@ func (t *instanceTopologyManager) mightSupportTopology(instanceType string, regi
125162 klog .Errorf ("Failed to get cached unsupported region: %q:" , err )
126163 }
127164
165+ if _ , exists , err := t .unsupportedKeyStore .GetByKey (instanceID ); exists {
166+ return false
167+ } else if err != nil {
168+ klog .Errorf ("Failed to get cached unsupported instance ID: %q:" , err )
169+ }
170+
128171 if _ , exists , err := t .unsupportedKeyStore .GetByKey (instanceType ); exists {
129172 return false
130173 } else if err != nil {
0 commit comments