Skip to content

Commit 96cd0c0

Browse files
committed
Make the cluster autoscaler ignore SageMaker HyperPod instances
1 parent 347db2d commit 96cd0c0

File tree

2 files changed

+43
-5
lines changed

2 files changed

+43
-5
lines changed

cluster-autoscaler/cloudprovider/aws/aws_cloud_provider.go

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,13 @@ func (aws *awsCloudProvider) NodeGroupForNode(node *apiv1.Node) (cloudprovider.N
115115
klog.Warningf("Node %v has no providerId", node.Name)
116116
return nil, nil
117117
}
118+
119+
// Skip SageMaker instances
120+
if strings.Contains(node.Spec.ProviderID, "/sagemaker") {
121+
klog.V(4).Infof("Skipping SageMaker node %s", node.Name)
122+
return nil, nil
123+
}
124+
118125
ref, err := AwsRefFromProviderId(node.Spec.ProviderID)
119126
if err != nil {
120127
// Dropping this into V as it will be noisy with many Hybrid Nodes
@@ -143,6 +150,11 @@ func (aws *awsCloudProvider) HasInstance(node *apiv1.Node) (bool, error) {
143150
return true, cloudprovider.ErrNotImplemented
144151
}
145152

153+
// Skip SageMaker instances: report them as present with ErrNotImplemented instead of looking them up.
154+
if strings.Contains(node.Spec.ProviderID, "/sagemaker") {
155+
return true, cloudprovider.ErrNotImplemented
156+
}
157+
146158
// avoid log spam for not autoscaled asgs:
147159
// Nodes that belong to an asg that is not autoscaled will not be found in the asgCache below,
148160
// so do not trigger warning spam by returning an error from being unable to find them.
@@ -209,6 +221,14 @@ var validAwsRefIdRegex = regexp.MustCompile(fmt.Sprintf(`^aws\:\/\/\/[-0-9a-z]*\
209221
// AwsRefFromProviderId creates AwsInstanceRef object from provider id which
210222
// must be in format: aws:///zone/name
211223
func AwsRefFromProviderId(id string) (*AwsInstanceRef, error) {
224+
// Special case for SageMaker format: aws:///<zone>/sagemaker/...
225+
if strings.HasPrefix(id, "aws:///") && strings.Contains(id, "/sagemaker") {
226+
return &AwsInstanceRef{
227+
ProviderID: id,
228+
Name: "sagemaker-node",
229+
}, nil
230+
}
231+
212232
if validAwsRefIdRegex.FindStringSubmatch(id) == nil {
213233
return nil, fmt.Errorf("wrong id: expected format aws:///<zone>/<name>, got %v", id)
214234
}
@@ -313,6 +333,11 @@ func (ng *AwsNodeGroup) DecreaseTargetSize(delta int) error {
313333

314334
// Belongs returns true if the given node belongs to the NodeGroup.
315335
func (ng *AwsNodeGroup) Belongs(node *apiv1.Node) (bool, error) {
336+
// Skip SageMaker instances
337+
if strings.Contains(node.Spec.ProviderID, "/sagemaker") {
338+
return false, nil
339+
}
340+
316341
ref, err := AwsRefFromProviderId(node.Spec.ProviderID)
317342
if err != nil {
318343
return false, err

cluster-autoscaler/cloudprovider/aws/aws_cloud_provider_test.go

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -725,21 +725,34 @@ func TestHasInstance(t *testing.T) {
725725
assert.Equal(t, cloudprovider.ErrNotImplemented, err)
726726
assert.True(t, present)
727727

728-
// Case 3: correct node - not present in AWS
728+
// Case 3: incorrect node - sagemaker hyperpod is unsupported
729729
node3 := &apiv1.Node{
730+
ObjectMeta: metav1.ObjectMeta{
731+
Name: "hyperpod-node-1",
732+
},
733+
Spec: apiv1.NodeSpec{
734+
ProviderID: "aws:///use1-az2/sagemaker/cluster/hyperpod-abc123-i-abc123",
735+
},
736+
}
737+
present, err = provider.HasInstance(node3)
738+
assert.Equal(t, cloudprovider.ErrNotImplemented, err)
739+
assert.True(t, present)
740+
741+
// Case 4: correct node - not present in AWS
742+
node4 := &apiv1.Node{
730743
ObjectMeta: metav1.ObjectMeta{
731744
Name: "node-2",
732745
},
733746
Spec: apiv1.NodeSpec{
734747
ProviderID: "aws:///us-east-1a/test-instance-id-2",
735748
},
736749
}
737-
present, err = provider.HasInstance(node3)
750+
present, err = provider.HasInstance(node4)
738751
assert.ErrorContains(t, err, nodeNotPresentErr)
739752
assert.False(t, present)
740753

741-
// Case 4: correct node - not autoscaled -> not present in AWS -> no warning
742-
node4 := &apiv1.Node{
754+
// Case 5: correct node - not autoscaled -> not present in AWS -> no warning
755+
node5 := &apiv1.Node{
743756
ObjectMeta: metav1.ObjectMeta{
744757
Name: "node-2",
745758
Annotations: map[string]string{
@@ -750,7 +763,7 @@ func TestHasInstance(t *testing.T) {
750763
ProviderID: "aws:///us-east-1a/test-instance-id-2",
751764
},
752765
}
753-
present, err = provider.HasInstance(node4)
766+
present, err = provider.HasInstance(node5)
754767
assert.NoError(t, err)
755768
assert.False(t, present)
756769
}

0 commit comments

Comments
 (0)