Skip to content

Include short unregistered nodes in calculation of incorrect node group #5894

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 29, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions cluster-autoscaler/clusterstate/clusterstate.go
Original file line number Diff line number Diff line change
Expand Up @@ -648,8 +648,9 @@ func (csr *ClusterStateRegistry) updateIncorrectNodeGroupSizes(currentTime time.
}
continue
}
if len(readiness.Registered) > acceptableRange.MaxNodes ||
len(readiness.Registered) < acceptableRange.MinNodes {
unregisteredNodes := len(readiness.Unregistered) + len(readiness.LongUnregistered)
if len(readiness.Registered) > acceptableRange.CurrentTarget ||
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why current target and not MaxNodes? It seem like MaxNodes is an obvious fit here, there are no changes to how you calculate it and I don't see why we shouldn't offset for scale-downs that we're aware of (tbh I wouldn't implement the whole acceptableRanges logic if we were starting from scratch, but why not use it if it's already there?

Copy link
Contributor Author

@BigDarkClown BigDarkClown Jun 29, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The idea is that we don't want the incorrect node group size to depend on the ongoing scale-ups and scale-downs. We want to only guard on the expected size. Otherwise we will effectively double our max node provisioning period for this mechanism.

Also in all fairness - the mechanism for fixing situations when current size > expected size is not implemented yet. It's possible that it will require some slightly different edge conditions, but I think it's better to leave figuring them to the person implementing this mechanism.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ack

len(readiness.Registered) < acceptableRange.CurrentTarget-unregisteredNodes {
incorrect := IncorrectNodeGroupSize{
CurrentSize: len(readiness.Registered),
ExpectedSize: acceptableRange.CurrentTarget,
Expand Down
316 changes: 316 additions & 0 deletions cluster-autoscaler/clusterstate/clusterstate_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1064,3 +1064,319 @@ func newBackoff() backoff.Backoff {
return backoff.NewIdBasedExponentialBackoff(5*time.Minute, /*InitialNodeGroupBackoffDuration*/
30*time.Minute /*MaxNodeGroupBackoffDuration*/, 3*time.Hour /*NodeGroupBackoffResetTimeout*/)
}

func TestUpdateAcceptableRanges(t *testing.T) {
testCases := []struct {
name string

targetSizes map[string]int
readiness map[string]Readiness
scaleUpRequests map[string]*ScaleUpRequest
scaledDownGroups []string

wantAcceptableRanges map[string]AcceptableRange
}{
{
name: "No scale-ups/scale-downs",
targetSizes: map[string]int{
"ng1": 10,
"ng2": 20,
},
readiness: map[string]Readiness{
"ng1": {Ready: make([]string, 10)},
"ng2": {Ready: make([]string, 20)},
},
wantAcceptableRanges: map[string]AcceptableRange{
"ng1": {MinNodes: 10, MaxNodes: 10, CurrentTarget: 10},
"ng2": {MinNodes: 20, MaxNodes: 20, CurrentTarget: 20},
},
},
{
name: "Ongoing scale-ups",
targetSizes: map[string]int{
"ng1": 10,
"ng2": 20,
},
readiness: map[string]Readiness{
"ng1": {Ready: make([]string, 10)},
"ng2": {Ready: make([]string, 20)},
},
scaleUpRequests: map[string]*ScaleUpRequest{
"ng1": {Increase: 3},
"ng2": {Increase: 5},
},
wantAcceptableRanges: map[string]AcceptableRange{
"ng1": {MinNodes: 7, MaxNodes: 10, CurrentTarget: 10},
"ng2": {MinNodes: 15, MaxNodes: 20, CurrentTarget: 20},
},
},
{
name: "Ongoing scale-downs",
targetSizes: map[string]int{
"ng1": 10,
"ng2": 20,
},
readiness: map[string]Readiness{
"ng1": {Ready: make([]string, 10)},
"ng2": {Ready: make([]string, 20)},
},
scaledDownGroups: []string{"ng1", "ng1", "ng2", "ng2", "ng2"},
wantAcceptableRanges: map[string]AcceptableRange{
"ng1": {MinNodes: 10, MaxNodes: 12, CurrentTarget: 10},
"ng2": {MinNodes: 20, MaxNodes: 23, CurrentTarget: 20},
},
},
{
name: "Some short unregistered nodes",
targetSizes: map[string]int{
"ng1": 10,
"ng2": 20,
},
readiness: map[string]Readiness{
"ng1": {Ready: make([]string, 8), Unregistered: make([]string, 2)},
"ng2": {Ready: make([]string, 17), Unregistered: make([]string, 3)},
},
wantAcceptableRanges: map[string]AcceptableRange{
"ng1": {MinNodes: 10, MaxNodes: 10, CurrentTarget: 10},
"ng2": {MinNodes: 20, MaxNodes: 20, CurrentTarget: 20},
},
},
{
name: "Some long unregistered nodes",
targetSizes: map[string]int{
"ng1": 10,
"ng2": 20,
},
readiness: map[string]Readiness{
"ng1": {Ready: make([]string, 8), LongUnregistered: make([]string, 2)},
"ng2": {Ready: make([]string, 17), LongUnregistered: make([]string, 3)},
},
wantAcceptableRanges: map[string]AcceptableRange{
"ng1": {MinNodes: 8, MaxNodes: 10, CurrentTarget: 10},
"ng2": {MinNodes: 17, MaxNodes: 20, CurrentTarget: 20},
},
},
{
name: "Everything together",
targetSizes: map[string]int{
"ng1": 10,
"ng2": 20,
},
readiness: map[string]Readiness{
"ng1": {Ready: make([]string, 8), Unregistered: make([]string, 1), LongUnregistered: make([]string, 2)},
"ng2": {Ready: make([]string, 17), Unregistered: make([]string, 3), LongUnregistered: make([]string, 4)},
},
scaleUpRequests: map[string]*ScaleUpRequest{
"ng1": {Increase: 3},
"ng2": {Increase: 5},
},
scaledDownGroups: []string{"ng1", "ng1", "ng2", "ng2", "ng2"},
wantAcceptableRanges: map[string]AcceptableRange{
"ng1": {MinNodes: 5, MaxNodes: 12, CurrentTarget: 10},
"ng2": {MinNodes: 11, MaxNodes: 23, CurrentTarget: 20},
},
},
}

for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
provider := testprovider.NewTestCloudProvider(nil, nil)
for nodeGroupName, targetSize := range tc.targetSizes {
provider.AddNodeGroup(nodeGroupName, 0, 1000, targetSize)
}
var scaleDownRequests []*ScaleDownRequest
for _, nodeGroupName := range tc.scaledDownGroups {
scaleDownRequests = append(scaleDownRequests, &ScaleDownRequest{
NodeGroup: provider.GetNodeGroup(nodeGroupName),
})
}

clusterState := &ClusterStateRegistry{
cloudProvider: provider,
perNodeGroupReadiness: tc.readiness,
scaleUpRequests: tc.scaleUpRequests,
scaleDownRequests: scaleDownRequests,
}

clusterState.updateAcceptableRanges(tc.targetSizes)
assert.Equal(t, tc.wantAcceptableRanges, clusterState.acceptableRanges)
})
}
}

func TestUpdateIncorrectNodeGroupSizes(t *testing.T) {
timeNow := time.Now()
testCases := []struct {
name string

acceptableRanges map[string]AcceptableRange
readiness map[string]Readiness
incorrectSizes map[string]IncorrectNodeGroupSize

wantIncorrectSizes map[string]IncorrectNodeGroupSize
}{
{
name: "node groups with correct sizes",
acceptableRanges: map[string]AcceptableRange{
"ng1": {CurrentTarget: 10},
"ng2": {CurrentTarget: 20},
},
readiness: map[string]Readiness{
"ng1": {Registered: make([]string, 10)},
"ng2": {Registered: make([]string, 20)},
},
incorrectSizes: map[string]IncorrectNodeGroupSize{},
wantIncorrectSizes: map[string]IncorrectNodeGroupSize{},
},
{
name: "node groups with correct sizes after not being correct sized",
acceptableRanges: map[string]AcceptableRange{
"ng1": {CurrentTarget: 10},
"ng2": {CurrentTarget: 20},
},
readiness: map[string]Readiness{
"ng1": {Registered: make([]string, 10)},
"ng2": {Registered: make([]string, 20)},
},
incorrectSizes: map[string]IncorrectNodeGroupSize{
"ng1": {CurrentSize: 8, ExpectedSize: 10, FirstObserved: timeNow.Add(-time.Hour)},
"ng2": {CurrentSize: 15, ExpectedSize: 20, FirstObserved: timeNow.Add(-time.Minute)},
},
wantIncorrectSizes: map[string]IncorrectNodeGroupSize{},
},
{
name: "node groups below the target size",
acceptableRanges: map[string]AcceptableRange{
"ng1": {CurrentTarget: 10},
"ng2": {CurrentTarget: 20},
},
readiness: map[string]Readiness{
"ng1": {Registered: make([]string, 8)},
"ng2": {Registered: make([]string, 15)},
},
incorrectSizes: map[string]IncorrectNodeGroupSize{},
wantIncorrectSizes: map[string]IncorrectNodeGroupSize{
"ng1": {CurrentSize: 8, ExpectedSize: 10, FirstObserved: timeNow},
"ng2": {CurrentSize: 15, ExpectedSize: 20, FirstObserved: timeNow},
},
},
{
name: "node groups above the target size",
acceptableRanges: map[string]AcceptableRange{
"ng1": {CurrentTarget: 10},
"ng2": {CurrentTarget: 20},
},
readiness: map[string]Readiness{
"ng1": {Registered: make([]string, 12)},
"ng2": {Registered: make([]string, 25)},
},
incorrectSizes: map[string]IncorrectNodeGroupSize{},
wantIncorrectSizes: map[string]IncorrectNodeGroupSize{
"ng1": {CurrentSize: 12, ExpectedSize: 10, FirstObserved: timeNow},
"ng2": {CurrentSize: 25, ExpectedSize: 20, FirstObserved: timeNow},
},
},
{
name: "node groups below the target size with changed delta",
acceptableRanges: map[string]AcceptableRange{
"ng1": {CurrentTarget: 10},
"ng2": {CurrentTarget: 20},
},
readiness: map[string]Readiness{
"ng1": {Registered: make([]string, 8)},
"ng2": {Registered: make([]string, 15)},
},
incorrectSizes: map[string]IncorrectNodeGroupSize{
"ng1": {CurrentSize: 7, ExpectedSize: 10, FirstObserved: timeNow.Add(-time.Hour)},
"ng2": {CurrentSize: 14, ExpectedSize: 20, FirstObserved: timeNow.Add(-time.Minute)},
},
wantIncorrectSizes: map[string]IncorrectNodeGroupSize{
"ng1": {CurrentSize: 8, ExpectedSize: 10, FirstObserved: timeNow},
"ng2": {CurrentSize: 15, ExpectedSize: 20, FirstObserved: timeNow},
},
},
{
name: "node groups below the target size with the same delta",
acceptableRanges: map[string]AcceptableRange{
"ng1": {CurrentTarget: 10},
"ng2": {CurrentTarget: 20},
},
readiness: map[string]Readiness{
"ng1": {Registered: make([]string, 8)},
"ng2": {Registered: make([]string, 15)},
},
incorrectSizes: map[string]IncorrectNodeGroupSize{
"ng1": {CurrentSize: 8, ExpectedSize: 10, FirstObserved: timeNow.Add(-time.Hour)},
"ng2": {CurrentSize: 15, ExpectedSize: 20, FirstObserved: timeNow.Add(-time.Minute)},
},
wantIncorrectSizes: map[string]IncorrectNodeGroupSize{
"ng1": {CurrentSize: 8, ExpectedSize: 10, FirstObserved: timeNow.Add(-time.Hour)},
"ng2": {CurrentSize: 15, ExpectedSize: 20, FirstObserved: timeNow.Add(-time.Minute)},
},
},
{
name: "node groups below the target size with short unregistered nodes",
acceptableRanges: map[string]AcceptableRange{
"ng1": {CurrentTarget: 10},
"ng2": {CurrentTarget: 20},
},
readiness: map[string]Readiness{
"ng1": {Registered: make([]string, 8), Unregistered: make([]string, 2)},
"ng2": {Registered: make([]string, 15), Unregistered: make([]string, 3)},
},
incorrectSizes: map[string]IncorrectNodeGroupSize{},
wantIncorrectSizes: map[string]IncorrectNodeGroupSize{
"ng2": {CurrentSize: 15, ExpectedSize: 20, FirstObserved: timeNow},
},
},
{
name: "node groups below the target size with long unregistered nodes",
acceptableRanges: map[string]AcceptableRange{
"ng1": {CurrentTarget: 10},
"ng2": {CurrentTarget: 20},
},
readiness: map[string]Readiness{
"ng1": {Registered: make([]string, 8), LongUnregistered: make([]string, 2)},
"ng2": {Registered: make([]string, 15), LongUnregistered: make([]string, 3)},
},
incorrectSizes: map[string]IncorrectNodeGroupSize{},
wantIncorrectSizes: map[string]IncorrectNodeGroupSize{
"ng2": {CurrentSize: 15, ExpectedSize: 20, FirstObserved: timeNow},
},
},
{
name: "node groups below the target size with various unregistered nodes",
acceptableRanges: map[string]AcceptableRange{
"ng1": {CurrentTarget: 10},
"ng2": {CurrentTarget: 20},
},
readiness: map[string]Readiness{
"ng1": {Registered: make([]string, 8), Unregistered: make([]string, 1), LongUnregistered: make([]string, 1)},
"ng2": {Registered: make([]string, 15), Unregistered: make([]string, 2), LongUnregistered: make([]string, 2)},
},
incorrectSizes: map[string]IncorrectNodeGroupSize{},
wantIncorrectSizes: map[string]IncorrectNodeGroupSize{
"ng2": {CurrentSize: 15, ExpectedSize: 20, FirstObserved: timeNow},
},
},
}

for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
provider := testprovider.NewTestCloudProvider(nil, nil)
for nodeGroupName, acceptableRange := range tc.acceptableRanges {
provider.AddNodeGroup(nodeGroupName, 0, 1000, acceptableRange.CurrentTarget)
}

clusterState := &ClusterStateRegistry{
cloudProvider: provider,
acceptableRanges: tc.acceptableRanges,
perNodeGroupReadiness: tc.readiness,
incorrectNodeGroupSizes: tc.incorrectSizes,
}

clusterState.updateIncorrectNodeGroupSizes(timeNow)
assert.Equal(t, tc.wantIncorrectSizes, clusterState.incorrectNodeGroupSizes)
})
}
}