Skip to content

Commit f9238e5

Browse files
committed
Include short unregistered nodes in calculation of incorrect node group
sizes
1 parent 0733535 commit f9238e5

File tree

2 files changed

+281
-13
lines changed

2 files changed

+281
-13
lines changed

cluster-autoscaler/clusterstate/clusterstate.go

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -373,7 +373,7 @@ func (csr *ClusterStateRegistry) IsClusterHealthy() bool {
373373
totalUnready := len(csr.totalReadiness.Unready)
374374

375375
if totalUnready > csr.config.OkTotalUnreadyCount &&
376-
float64(totalUnready) > csr.config.MaxTotalUnreadyPercentage/100.0*float64(len(csr.nodes)) {
376+
float64(totalUnready) > csr.config.MaxTotalUnreadyPercentage/100.0*float64(len(csr.nodes)) {
377377
return false
378378
}
379379

@@ -406,8 +406,8 @@ func (csr *ClusterStateRegistry) IsNodeGroupHealthy(nodeGroupName string) bool {
406406
// TODO: verify against max nodes as well.
407407

408408
if unjustifiedUnready > csr.config.OkTotalUnreadyCount &&
409-
float64(unjustifiedUnready) > csr.config.MaxTotalUnreadyPercentage/100.0*
410-
float64(len(readiness.Ready)+len(readiness.Unready)+len(readiness.NotStarted)) {
409+
float64(unjustifiedUnready) > csr.config.MaxTotalUnreadyPercentage/100.0*
410+
float64(len(readiness.Ready)+len(readiness.Unready)+len(readiness.NotStarted)) {
411411
return false
412412
}
413413

@@ -510,9 +510,8 @@ func (csr *ClusterStateRegistry) updateAcceptableRanges(targetSize map[string]in
510510
result := make(map[string]AcceptableRange)
511511
for _, nodeGroup := range csr.cloudProvider.NodeGroups() {
512512
size := targetSize[nodeGroup.Id()]
513-
readiness := csr.perNodeGroupReadiness[nodeGroup.Id()]
514513
result[nodeGroup.Id()] = AcceptableRange{
515-
MinNodes: size - len(readiness.LongUnregistered),
514+
MinNodes: size,
516515
MaxNodes: size,
517516
CurrentTarget: size,
518517
}
@@ -648,8 +647,9 @@ func (csr *ClusterStateRegistry) updateIncorrectNodeGroupSizes(currentTime time.
648647
}
649648
continue
650649
}
651-
if len(readiness.Registered) > acceptableRange.MaxNodes ||
652-
len(readiness.Registered) < acceptableRange.MinNodes {
650+
unregisteredNodes := len(readiness.Unregistered) + len(readiness.LongUnregistered)
651+
if len(readiness.Registered) > acceptableRange.CurrentTarget ||
652+
len(readiness.Registered) < acceptableRange.CurrentTarget-unregisteredNodes {
653653
incorrect := IncorrectNodeGroupSize{
654654
CurrentSize: len(readiness.Registered),
655655
ExpectedSize: acceptableRange.CurrentTarget,
@@ -658,7 +658,7 @@ func (csr *ClusterStateRegistry) updateIncorrectNodeGroupSizes(currentTime time.
658658
existing, found := csr.incorrectNodeGroupSizes[nodeGroup.Id()]
659659
if found {
660660
if incorrect.CurrentSize == existing.CurrentSize &&
661-
incorrect.ExpectedSize == existing.ExpectedSize {
661+
incorrect.ExpectedSize == existing.ExpectedSize {
662662
incorrect = existing
663663
}
664664
}
@@ -847,7 +847,7 @@ func buildScaleUpStatusClusterwide(nodeGroupStatuses []api.NodeGroupStatus, read
847847
for _, nodeGroupStatuses := range nodeGroupStatuses {
848848
for _, condition := range nodeGroupStatuses.Conditions {
849849
if condition.Type == api.ClusterAutoscalerScaleUp &&
850-
condition.Status == api.ClusterAutoscalerInProgress {
850+
condition.Status == api.ClusterAutoscalerInProgress {
851851
isScaleUpInProgress = true
852852
}
853853
}
@@ -1046,10 +1046,10 @@ func (csr *ClusterStateRegistry) handleInstanceCreationErrors(currentTime time.T
10461046
}
10471047

10481048
func (csr *ClusterStateRegistry) handleInstanceCreationErrorsForNodeGroup(
1049-
nodeGroup cloudprovider.NodeGroup,
1050-
currentInstances []cloudprovider.Instance,
1051-
previousInstances []cloudprovider.Instance,
1052-
currentTime time.Time) {
1049+
nodeGroup cloudprovider.NodeGroup,
1050+
currentInstances []cloudprovider.Instance,
1051+
previousInstances []cloudprovider.Instance,
1052+
currentTime time.Time) {
10531053

10541054
_, currentUniqueErrorMessagesForErrorCode, currentErrorCodeToInstance := csr.buildInstanceToErrorCodeMappings(currentInstances)
10551055
previousInstanceToErrorCode, _, _ := csr.buildInstanceToErrorCodeMappings(previousInstances)

cluster-autoscaler/clusterstate/clusterstate_test.go

Lines changed: 268 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1064,3 +1064,271 @@ func newBackoff() backoff.Backoff {
10641064
return backoff.NewIdBasedExponentialBackoff(5*time.Minute, /*InitialNodeGroupBackoffDuration*/
10651065
30*time.Minute /*MaxNodeGroupBackoffDuration*/, 3*time.Hour /*NodeGroupBackoffResetTimeout*/)
10661066
}
1067+
1068+
func TestUpdateAcceptableRanges(t *testing.T) {
1069+
testCases := []struct {
1070+
name string
1071+
1072+
targetSizes map[string]int
1073+
scaleUpRequests map[string]*ScaleUpRequest
1074+
scaledDownGroups []string
1075+
1076+
wantAcceptableRanges map[string]AcceptableRange
1077+
}{
1078+
{
1079+
name: "No scale-ups/scale-downs",
1080+
targetSizes: map[string]int{
1081+
"ng1": 10,
1082+
"ng2": 20,
1083+
},
1084+
wantAcceptableRanges: map[string]AcceptableRange{
1085+
"ng1": {MinNodes: 10, MaxNodes: 10, CurrentTarget: 10},
1086+
"ng2": {MinNodes: 20, MaxNodes: 20, CurrentTarget: 20},
1087+
},
1088+
},
1089+
{
1090+
name: "Ongoing scale-ups",
1091+
targetSizes: map[string]int{
1092+
"ng1": 10,
1093+
"ng2": 20,
1094+
},
1095+
scaleUpRequests: map[string]*ScaleUpRequest{
1096+
"ng1": {Increase: 3},
1097+
"ng2": {Increase: 5},
1098+
},
1099+
wantAcceptableRanges: map[string]AcceptableRange{
1100+
"ng1": {MinNodes: 7, MaxNodes: 10, CurrentTarget: 10},
1101+
"ng2": {MinNodes: 15, MaxNodes: 20, CurrentTarget: 20},
1102+
},
1103+
},
1104+
{
1105+
name: "Ongoing scale-downs",
1106+
targetSizes: map[string]int{
1107+
"ng1": 10,
1108+
"ng2": 20,
1109+
},
1110+
scaledDownGroups: []string{"ng1", "ng1", "ng2", "ng2", "ng2"},
1111+
wantAcceptableRanges: map[string]AcceptableRange{
1112+
"ng1": {MinNodes: 10, MaxNodes: 12, CurrentTarget: 10},
1113+
"ng2": {MinNodes: 20, MaxNodes: 23, CurrentTarget: 20},
1114+
},
1115+
},
1116+
{
1117+
name: "Everything together",
1118+
targetSizes: map[string]int{
1119+
"ng1": 10,
1120+
"ng2": 20,
1121+
},
1122+
scaleUpRequests: map[string]*ScaleUpRequest{
1123+
"ng1": {Increase: 3},
1124+
"ng2": {Increase: 5},
1125+
},
1126+
scaledDownGroups: []string{"ng1", "ng1", "ng2", "ng2", "ng2"},
1127+
wantAcceptableRanges: map[string]AcceptableRange{
1128+
"ng1": {MinNodes: 7, MaxNodes: 12, CurrentTarget: 10},
1129+
"ng2": {MinNodes: 15, MaxNodes: 23, CurrentTarget: 20},
1130+
},
1131+
},
1132+
}
1133+
1134+
for _, tc := range testCases {
1135+
t.Run(tc.name, func(t *testing.T) {
1136+
provider := testprovider.NewTestCloudProvider(nil, nil)
1137+
for nodeGroupName, targetSize := range tc.targetSizes {
1138+
provider.AddNodeGroup(nodeGroupName, 0, 1000, targetSize)
1139+
}
1140+
var scaleDownRequests []*ScaleDownRequest
1141+
for _, nodeGroupName := range tc.scaledDownGroups {
1142+
scaleDownRequests = append(scaleDownRequests, &ScaleDownRequest{
1143+
NodeGroup: provider.GetNodeGroup(nodeGroupName),
1144+
})
1145+
}
1146+
1147+
clusterState := &ClusterStateRegistry{
1148+
cloudProvider: provider,
1149+
scaleUpRequests: tc.scaleUpRequests,
1150+
scaleDownRequests: scaleDownRequests,
1151+
}
1152+
1153+
clusterState.updateAcceptableRanges(tc.targetSizes)
1154+
assert.Equal(t, tc.wantAcceptableRanges, clusterState.acceptableRanges)
1155+
})
1156+
}
1157+
}
1158+
1159+
func TestUpdateIncorrectNodeGroupSizes(t *testing.T) {
1160+
timeNow := time.Now()
1161+
testCases := []struct {
1162+
name string
1163+
1164+
acceptableRanges map[string]AcceptableRange
1165+
readiness map[string]Readiness
1166+
incorrectSizes map[string]IncorrectNodeGroupSize
1167+
1168+
wantIncorrectSizes map[string]IncorrectNodeGroupSize
1169+
}{
1170+
{
1171+
name: "node groups with correct sizes",
1172+
acceptableRanges: map[string]AcceptableRange{
1173+
"ng1": {CurrentTarget: 10},
1174+
"ng2": {CurrentTarget: 20},
1175+
},
1176+
readiness: map[string]Readiness{
1177+
"ng1": {Registered: make([]string, 10)},
1178+
"ng2": {Registered: make([]string, 20)},
1179+
},
1180+
incorrectSizes: map[string]IncorrectNodeGroupSize{},
1181+
wantIncorrectSizes: map[string]IncorrectNodeGroupSize{},
1182+
},
1183+
{
1184+
name: "node groups with correct sizes after not being correct sized",
1185+
acceptableRanges: map[string]AcceptableRange{
1186+
"ng1": {CurrentTarget: 10},
1187+
"ng2": {CurrentTarget: 20},
1188+
},
1189+
readiness: map[string]Readiness{
1190+
"ng1": {Registered: make([]string, 10)},
1191+
"ng2": {Registered: make([]string, 20)},
1192+
},
1193+
incorrectSizes: map[string]IncorrectNodeGroupSize{
1194+
"ng1": {CurrentSize: 8, ExpectedSize: 10, FirstObserved: timeNow.Add(-time.Hour)},
1195+
"ng2": {CurrentSize: 15, ExpectedSize: 20, FirstObserved: timeNow.Add(-time.Minute)},
1196+
},
1197+
wantIncorrectSizes: map[string]IncorrectNodeGroupSize{},
1198+
},
1199+
{
1200+
name: "node groups below the target size",
1201+
acceptableRanges: map[string]AcceptableRange{
1202+
"ng1": {CurrentTarget: 10},
1203+
"ng2": {CurrentTarget: 20},
1204+
},
1205+
readiness: map[string]Readiness{
1206+
"ng1": {Registered: make([]string, 8)},
1207+
"ng2": {Registered: make([]string, 15)},
1208+
},
1209+
incorrectSizes: map[string]IncorrectNodeGroupSize{},
1210+
wantIncorrectSizes: map[string]IncorrectNodeGroupSize{
1211+
"ng1": {CurrentSize: 8, ExpectedSize: 10, FirstObserved: timeNow},
1212+
"ng2": {CurrentSize: 15, ExpectedSize: 20, FirstObserved: timeNow},
1213+
},
1214+
},
1215+
{
1216+
name: "node groups above the target size",
1217+
acceptableRanges: map[string]AcceptableRange{
1218+
"ng1": {CurrentTarget: 10},
1219+
"ng2": {CurrentTarget: 20},
1220+
},
1221+
readiness: map[string]Readiness{
1222+
"ng1": {Registered: make([]string, 12)},
1223+
"ng2": {Registered: make([]string, 25)},
1224+
},
1225+
incorrectSizes: map[string]IncorrectNodeGroupSize{},
1226+
wantIncorrectSizes: map[string]IncorrectNodeGroupSize{
1227+
"ng1": {CurrentSize: 12, ExpectedSize: 10, FirstObserved: timeNow},
1228+
"ng2": {CurrentSize: 25, ExpectedSize: 20, FirstObserved: timeNow},
1229+
},
1230+
},
1231+
{
1232+
name: "node groups below the target size with changed delta",
1233+
acceptableRanges: map[string]AcceptableRange{
1234+
"ng1": {CurrentTarget: 10},
1235+
"ng2": {CurrentTarget: 20},
1236+
},
1237+
readiness: map[string]Readiness{
1238+
"ng1": {Registered: make([]string, 8)},
1239+
"ng2": {Registered: make([]string, 15)},
1240+
},
1241+
incorrectSizes: map[string]IncorrectNodeGroupSize{
1242+
"ng1": {CurrentSize: 7, ExpectedSize: 10, FirstObserved: timeNow.Add(-time.Hour)},
1243+
"ng2": {CurrentSize: 14, ExpectedSize: 20, FirstObserved: timeNow.Add(-time.Minute)},
1244+
},
1245+
wantIncorrectSizes: map[string]IncorrectNodeGroupSize{
1246+
"ng1": {CurrentSize: 8, ExpectedSize: 10, FirstObserved: timeNow},
1247+
"ng2": {CurrentSize: 15, ExpectedSize: 20, FirstObserved: timeNow},
1248+
},
1249+
},
1250+
{
1251+
name: "node groups below the target size with the same delta",
1252+
acceptableRanges: map[string]AcceptableRange{
1253+
"ng1": {CurrentTarget: 10},
1254+
"ng2": {CurrentTarget: 20},
1255+
},
1256+
readiness: map[string]Readiness{
1257+
"ng1": {Registered: make([]string, 8)},
1258+
"ng2": {Registered: make([]string, 15)},
1259+
},
1260+
incorrectSizes: map[string]IncorrectNodeGroupSize{
1261+
"ng1": {CurrentSize: 8, ExpectedSize: 10, FirstObserved: timeNow.Add(-time.Hour)},
1262+
"ng2": {CurrentSize: 15, ExpectedSize: 20, FirstObserved: timeNow.Add(-time.Minute)},
1263+
},
1264+
wantIncorrectSizes: map[string]IncorrectNodeGroupSize{
1265+
"ng1": {CurrentSize: 8, ExpectedSize: 10, FirstObserved: timeNow.Add(-time.Hour)},
1266+
"ng2": {CurrentSize: 15, ExpectedSize: 20, FirstObserved: timeNow.Add(-time.Minute)},
1267+
},
1268+
},
1269+
{
1270+
name: "node groups below the target size with short unregistered nodes",
1271+
acceptableRanges: map[string]AcceptableRange{
1272+
"ng1": {CurrentTarget: 10},
1273+
"ng2": {CurrentTarget: 20},
1274+
},
1275+
readiness: map[string]Readiness{
1276+
"ng1": {Registered: make([]string, 8), Unregistered: make([]string, 2)},
1277+
"ng2": {Registered: make([]string, 15), Unregistered: make([]string, 3)},
1278+
},
1279+
incorrectSizes: map[string]IncorrectNodeGroupSize{},
1280+
wantIncorrectSizes: map[string]IncorrectNodeGroupSize{
1281+
"ng2": {CurrentSize: 15, ExpectedSize: 20, FirstObserved: timeNow},
1282+
},
1283+
},
1284+
{
1285+
name: "node groups below the target size with long unregistered nodes",
1286+
acceptableRanges: map[string]AcceptableRange{
1287+
"ng1": {CurrentTarget: 10},
1288+
"ng2": {CurrentTarget: 20},
1289+
},
1290+
readiness: map[string]Readiness{
1291+
"ng1": {Registered: make([]string, 8), LongUnregistered: make([]string, 2)},
1292+
"ng2": {Registered: make([]string, 15), LongUnregistered: make([]string, 3)},
1293+
},
1294+
incorrectSizes: map[string]IncorrectNodeGroupSize{},
1295+
wantIncorrectSizes: map[string]IncorrectNodeGroupSize{
1296+
"ng2": {CurrentSize: 15, ExpectedSize: 20, FirstObserved: timeNow},
1297+
},
1298+
},
1299+
{
1300+
name: "node groups below the target size with various unregistered nodes",
1301+
acceptableRanges: map[string]AcceptableRange{
1302+
"ng1": {CurrentTarget: 10},
1303+
"ng2": {CurrentTarget: 20},
1304+
},
1305+
readiness: map[string]Readiness{
1306+
"ng1": {Registered: make([]string, 8), Unregistered: make([]string, 1), LongUnregistered: make([]string, 1)},
1307+
"ng2": {Registered: make([]string, 15), Unregistered: make([]string, 2), LongUnregistered: make([]string, 2)},
1308+
},
1309+
incorrectSizes: map[string]IncorrectNodeGroupSize{},
1310+
wantIncorrectSizes: map[string]IncorrectNodeGroupSize{
1311+
"ng2": {CurrentSize: 15, ExpectedSize: 20, FirstObserved: timeNow},
1312+
},
1313+
},
1314+
}
1315+
1316+
for _, tc := range testCases {
1317+
t.Run(tc.name, func(t *testing.T) {
1318+
provider := testprovider.NewTestCloudProvider(nil, nil)
1319+
for nodeGroupName, acceptableRange := range tc.acceptableRanges {
1320+
provider.AddNodeGroup(nodeGroupName, 0, 1000, acceptableRange.CurrentTarget)
1321+
}
1322+
1323+
clusterState := &ClusterStateRegistry{
1324+
cloudProvider: provider,
1325+
acceptableRanges: tc.acceptableRanges,
1326+
perNodeGroupReadiness: tc.readiness,
1327+
incorrectNodeGroupSizes: tc.incorrectSizes,
1328+
}
1329+
1330+
clusterState.updateIncorrectNodeGroupSizes(timeNow)
1331+
assert.Equal(t, tc.wantIncorrectSizes, clusterState.incorrectNodeGroupSizes)
1332+
})
1333+
}
1334+
}

0 commit comments

Comments
 (0)