kubernetes · k8s-ci-robot · Jun 30, 2023 · May 31, 2023 · May 31, 2023 · Jun 1, 2023
diff --git a/cluster-autoscaler/cloudprovider/test/test_cloud_provider.go b/cluster-autoscaler/cloudprovider/test/test_cloud_provider.go
@@ -443,7 +443,7 @@ func (tng *TestNodeGroup) DeleteNodes(nodes []*apiv1.Node) error {
 	id := tng.id
 	tng.targetSize -= len(nodes)
 	tng.Unlock()
-	if tng.opts != nil && tng.opts.AtomicScaling && tng.targetSize != 0 {
+	if tng.opts != nil && tng.opts.ZeroOrMaxNodeScaling && tng.targetSize != 0 {
 		return fmt.Errorf("TestNodeGroup: attempted to partially scale down a node group that should be scaled down atomically")
 	}
 	for _, node := range nodes {

diff --git a/cluster-autoscaler/config/autoscaling_options.go b/cluster-autoscaler/config/autoscaling_options.go
@@ -46,8 +46,8 @@ type NodeGroupAutoscalingOptions struct {
 	ScaleDownUnreadyTime time.Duration
 	// Maximum time CA waits for node to be provisioned
 	MaxNodeProvisionTime time.Duration
-	// AtomicScaling means that all nodes should be provisioned and brought down all at once instead of one-by-one
-	AtomicScaling bool
+	// ZeroOrMaxNodeScaling means that a node group should be scaled up to maximum size or down to zero nodes all at once instead of one-by-one.
+	ZeroOrMaxNodeScaling bool
 }
 
 // GCEOptions contain autoscaling options specific to GCE cloud provider.

diff --git a/cluster-autoscaler/core/scaledown/actuation/actuator_test.go b/cluster-autoscaler/core/scaledown/actuation/actuator_test.go
@@ -1103,7 +1103,7 @@ func TestStartDeletionInBatchBasic(t *testing.T) {
 func sizedNodeGroup(id string, size int, atomic bool) cloudprovider.NodeGroup {
 	ng := testprovider.NewTestNodeGroup(id, 10000, 0, size, true, false, "n1-standard-2", nil, nil)
 	ng.SetOptions(&config.NodeGroupAutoscalingOptions{
-		AtomicScaling: atomic,
+		ZeroOrMaxNodeScaling: atomic,
 	})
 	return ng
 }

diff --git a/cluster-autoscaler/core/scaledown/actuation/group_deletion_scheduler.go b/cluster-autoscaler/core/scaledown/actuation/group_deletion_scheduler.go
@@ -79,7 +79,7 @@ func (ds *GroupDeletionScheduler) ScheduleDeletion(nodeInfo *framework.NodeInfo,
 		return
 	}
 
-	ds.addToBatcher(nodeInfo, nodeGroup, batchSize, drain, opts.AtomicScaling)
+	ds.addToBatcher(nodeInfo, nodeGroup, batchSize, drain, opts.ZeroOrMaxNodeScaling)
 }
 
 // prepareNodeForDeletion is a long-running operation, so it needs to avoid locking the AtomicDeletionScheduler object

diff --git a/cluster-autoscaler/core/scaledown/budgets/budgets.go b/cluster-autoscaler/core/scaledown/budgets/budgets.go
@@ -173,7 +173,7 @@ func (bp *ScaleDownBudgetProcessor) categorize(groups []*NodeGroupView) (individ
 			klog.Errorf("Failed to get autoscaling options for node group %s: %v", view.Group.Id(), err)
 			continue
 		}
-		if autoscalingOptions != nil && autoscalingOptions.AtomicScaling {
+		if autoscalingOptions != nil && autoscalingOptions.ZeroOrMaxNodeScaling {
 			atomic = append(atomic, view)
 		} else {
 			individual = append(individual, view)
@@ -196,7 +196,7 @@ func (bp *ScaleDownBudgetProcessor) groupByNodeGroup(nodes []*apiv1.Node) (indiv
 			klog.Errorf("Failed to get autoscaling options for node group %s: %v", nodeGroup.Id(), err)
 			continue
 		}
-		if autoscalingOptions != nil && autoscalingOptions.AtomicScaling {
+		if autoscalingOptions != nil && autoscalingOptions.ZeroOrMaxNodeScaling {
 			if idx, ok := atomicGroup[nodeGroup]; ok {
 				atomic[idx].Nodes = append(atomic[idx].Nodes, node)
 			} else {

diff --git a/cluster-autoscaler/core/scaledown/budgets/budgets_test.go b/cluster-autoscaler/core/scaledown/budgets/budgets_test.go
@@ -361,7 +361,7 @@ var transformNodeGroupView = cmp.Transformer("transformNodeGroupView", func(b No
 func sizedNodeGroup(id string, size int, atomic bool) cloudprovider.NodeGroup {
 	ng := testprovider.NewTestNodeGroup(id, 10000, 0, size, true, false, "n1-standard-2", nil, nil)
 	ng.SetOptions(&config.NodeGroupAutoscalingOptions{
-		AtomicScaling: atomic,
+		ZeroOrMaxNodeScaling: atomic,
 	})
 	return ng
 }

diff --git a/cluster-autoscaler/core/scaledown/planner/planner_test.go b/cluster-autoscaler/core/scaledown/planner/planner_test.go
@@ -795,7 +795,7 @@ func TestNodesToDelete(t *testing.T) {
 func sizedNodeGroup(id string, size int, atomic bool) cloudprovider.NodeGroup {
 	ng := testprovider.NewTestNodeGroup(id, 10000, 0, size, true, false, "n1-standard-2", nil, nil)
 	ng.SetOptions(&config.NodeGroupAutoscalingOptions{
-		AtomicScaling: atomic,
+		ZeroOrMaxNodeScaling: atomic,
 	})
 	return ng
 }

diff --git a/cluster-autoscaler/core/scaleup/orchestrator/orchestrator.go b/cluster-autoscaler/core/scaleup/orchestrator/orchestrator.go
@@ -130,7 +130,7 @@ func (o *ScaleUpOrchestrator) ScaleUp(
 	now := time.Now()
 
 	// Filter out invalid node groups
-	validNodeGroups, skippedNodeGroups := o.filterValidScaleUpNodeGroups(nodeGroups, nodeInfos, resourcesLeft, now)
+	validNodeGroups, skippedNodeGroups := o.filterValidScaleUpNodeGroups(nodeGroups, nodeInfos, resourcesLeft, len(nodes)+len(upcomingNodes), now)
 
 	// Mark skipped node groups as processed.
 	for nodegroupID := range skippedNodeGroups {
@@ -362,7 +362,7 @@ func (o *ScaleUpOrchestrator) ScaleUpToNodeGroupMinSize(
 			continue
 		}
 
-		if skipReason := o.IsNodeGroupResourceExceeded(resourcesLeft, ng, nodeInfo); skipReason != nil {
+		if skipReason := o.IsNodeGroupResourceExceeded(resourcesLeft, ng, nodeInfo, 1); skipReason != nil {
 			klog.Warning("ScaleUpToNodeGroupMinSize: node group resource excceded: %v", skipReason)
 			continue
 		}
@@ -418,6 +418,7 @@ func (o *ScaleUpOrchestrator) filterValidScaleUpNodeGroups(
 	nodeGroups []cloudprovider.NodeGroup,
 	nodeInfos map[string]*schedulerframework.NodeInfo,
 	resourcesLeft resource.Limits,
+	currentNodeCount int,
 	now time.Time,
 ) ([]cloudprovider.NodeGroup, map[string]status.Reasons) {
 	var validNodeGroups []cloudprovider.NodeGroup
@@ -440,14 +441,27 @@ func (o *ScaleUpOrchestrator) filterValidScaleUpNodeGroups(
 			skippedNodeGroups[nodeGroup.Id()] = MaxLimitReachedReason
 			continue
 		}
+		autoscalingOptions, err := nodeGroup.GetOptions(o.autoscalingContext.NodeGroupDefaults)
+		if err != nil {
+			klog.Errorf("Couldn't get autoscaling options for ng: %v", nodeGroup.Id())
+		}
+		numNodes := 1
+		if autoscalingOptions != nil && autoscalingOptions.ZeroOrMaxNodeScaling {
+			numNodes = nodeGroup.MaxSize() - currentTargetSize
+			if o.autoscalingContext.MaxNodesTotal != 0 && currentNodeCount+numNodes > o.autoscalingContext.MaxNodesTotal {
+				klog.V(4).Infof("Skipping node group %s - atomic scale-up exceeds cluster node count limit", nodeGroup.Id())
+				skippedNodeGroups[nodeGroup.Id()] = NewSkippedReasons("atomic scale-up exceeds cluster node count limit")
+				continue
+			}
+		}
 
 		nodeInfo, found := nodeInfos[nodeGroup.Id()]
 		if !found {
 			klog.Errorf("No node info for: %s", nodeGroup.Id())
 			skippedNodeGroups[nodeGroup.Id()] = NotReadyReason
 			continue
 		}
-		if skipReason := o.IsNodeGroupResourceExceeded(resourcesLeft, nodeGroup, nodeInfo); skipReason != nil {
+		if skipReason := o.IsNodeGroupResourceExceeded(resourcesLeft, nodeGroup, nodeInfo, numNodes); skipReason != nil {
 			skippedNodeGroups[nodeGroup.Id()] = skipReason
 			continue
 		}
@@ -476,6 +490,16 @@ func (o *ScaleUpOrchestrator) ComputeExpansionOption(
 	estimator := o.autoscalingContext.EstimatorBuilder(o.autoscalingContext.PredicateChecker, o.autoscalingContext.ClusterSnapshot)
 	option.NodeCount, option.Pods = estimator.Estimate(pods, nodeInfo, nodeGroup)
 	option.SimilarNodeGroups = o.ComputeSimilarNodeGroups(nodeGroup, nodeInfos, schedulablePods, now)
+
+	autoscalingOptions, err := nodeGroup.GetOptions(o.autoscalingContext.NodeGroupDefaults)
+	if err != nil {
+		klog.Errorf("Failed to get autoscaling options for node group %s: %v", nodeGroup.Id(), err)
+	}
+	if autoscalingOptions != nil && autoscalingOptions.ZeroOrMaxNodeScaling {
+		if option.NodeCount > 0 && option.NodeCount != nodeGroup.MaxSize() {
+			option.NodeCount = nodeGroup.MaxSize()
+		}
+	}
 	return option
 }
 
@@ -551,13 +575,17 @@ func (o *ScaleUpOrchestrator) IsNodeGroupReadyToScaleUp(nodeGroup cloudprovider.
 }
 
 // IsNodeGroupResourceExceeded returns nil if node group resource limits are not exceeded, otherwise a reason is provided.
-func (o *ScaleUpOrchestrator) IsNodeGroupResourceExceeded(resourcesLeft resource.Limits, nodeGroup cloudprovider.NodeGroup, nodeInfo *schedulerframework.NodeInfo) status.Reasons {
+func (o *ScaleUpOrchestrator) IsNodeGroupResourceExceeded(resourcesLeft resource.Limits, nodeGroup cloudprovider.NodeGroup, nodeInfo *schedulerframework.NodeInfo, numNodes int) status.Reasons {
 	resourcesDelta, err := o.resourceManager.DeltaForNode(o.autoscalingContext, nodeInfo, nodeGroup)
 	if err != nil {
 		klog.Errorf("Skipping node group %s; error getting node group resources: %v", nodeGroup.Id(), err)
 		return NotReadyReason
 	}
 
+	for resource, delta := range resourcesDelta {
+		resourcesDelta[resource] = delta * int64(numNodes)
+	}
+
 	checkResult := resource.CheckDeltaWithinLimits(resourcesLeft, resourcesDelta)
 	if checkResult.Exceeded {
 		klog.V(4).Infof("Skipping node group %s; maximal limit exceeded for %v", nodeGroup.Id(), checkResult.ExceededResources)
@@ -601,6 +629,14 @@ func (o *ScaleUpOrchestrator) ComputeSimilarNodeGroups(
 		return nil
 	}
 
+	autoscalingOptions, err := nodeGroup.GetOptions(o.autoscalingContext.NodeGroupDefaults)
+	if err != nil {
+		klog.Errorf("Failed to get autoscaling options for node group %s: %v", nodeGroup.Id(), err)
+	}
+	if autoscalingOptions != nil && autoscalingOptions.ZeroOrMaxNodeScaling {
+		return nil
+	}
+
 	groupSchedulablePods, found := schedulablePods[nodeGroup.Id()]
 	if !found || len(groupSchedulablePods) == 0 {
 		return nil
-Original file line number
+Diff line change
@@ Expand Up @@
     		return
     	}
-    	ds.addToBatcher(nodeInfo, nodeGroup, batchSize, drain, opts.AtomicScaling)
+    	ds.addToBatcher(nodeInfo, nodeGroup, batchSize, drain, opts.ZeroOrMaxNodeScaling)
     }
     // prepareNodeForDeletion is a long-running operation, so it needs to avoid locking the AtomicDeletionScheduler object
@@ Expand Down @@