Skip to content

Commit 68dc6cc

Browse files
Node pool operations should retry if they encountered quota error. (#8828) (#15820)
Signed-off-by: Modular Magician <[email protected]>
1 parent c0c6796 commit 68dc6cc

File tree

3 files changed

+56
-34
lines changed

3 files changed

+56
-34
lines changed

.changelog/8828.txt

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
```release-note:bug
2+
container: fixed quota errors caused by concurrent operations not being retried in `google_container_node_pool`
3+
```

google/services/container/resource_container_node_pool.go

+39-17
Original file line numberDiff line numberDiff line change
@@ -517,9 +517,11 @@ func resourceContainerNodePoolCreate(d *schema.ResourceData, meta interface{}) e
517517
operation, err = clusterNodePoolsCreateCall.Do()
518518

519519
if err != nil {
520-
if tpgresource.IsFailedPreconditionError(err) {
520+
if tpgresource.IsFailedPreconditionError(err) || tpgresource.IsQuotaError(err) {
521521
// We get failed precondition errors if the cluster is updating
522522
// while we try to add the node pool.
523+
// We get quota errors if the number of running concurrent
524+
// operations reaches the quota.
523525
return resource.RetryableError(err)
524526
}
525527
return resource.NonRetryableError(err)
@@ -722,9 +724,11 @@ func resourceContainerNodePoolDelete(d *schema.ResourceData, meta interface{}) e
722724
operation, err = clusterNodePoolsDeleteCall.Do()
723725

724726
if err != nil {
725-
if tpgresource.IsFailedPreconditionError(err) {
727+
if tpgresource.IsFailedPreconditionError(err) || tpgresource.IsQuotaError(err) {
726728
// We get failed precondition errors if the cluster is updating
727729
// while we try to delete the node pool.
730+
// We get quota errors if the number of running concurrent
731+
// operations reaches the quota.
728732
return resource.RetryableError(err)
729733
}
730734
return resource.NonRetryableError(err)
@@ -1202,7 +1206,7 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
12021206
timeout)
12031207
}
12041208

1205-
if err := tpgresource.RetryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
1209+
if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
12061210
return err
12071211
}
12081212
log.Printf("[INFO] Updated autoscaling in Node Pool %s", d.Id())
@@ -1240,7 +1244,7 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
12401244
timeout)
12411245
}
12421246

1243-
if err := tpgresource.RetryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
1247+
if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
12441248
return err
12451249
}
12461250

@@ -1294,7 +1298,7 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
12941298
timeout)
12951299
}
12961300

1297-
if err := tpgresource.RetryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
1301+
if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
12981302
return err
12991303
}
13001304
log.Printf("[INFO] Updated tags for node pool %s", name)
@@ -1331,7 +1335,7 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
13311335
}
13321336

13331337
// Call update serially.
1334-
if err := tpgresource.RetryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
1338+
if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
13351339
return err
13361340
}
13371341

@@ -1369,7 +1373,7 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
13691373
}
13701374

13711375
// Call update serially.
1372-
if err := tpgresource.RetryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
1376+
if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
13731377
return err
13741378
}
13751379

@@ -1401,7 +1405,7 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
14011405
timeout)
14021406
}
14031407

1404-
if err := tpgresource.RetryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
1408+
if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
14051409
return err
14061410
}
14071411
log.Printf("[INFO] Updated image type in Node Pool %s", d.Id())
@@ -1435,7 +1439,7 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
14351439
timeout)
14361440
}
14371441

1438-
if err := tpgresource.RetryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
1442+
if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
14391443
return err
14401444
}
14411445
log.Printf("[INFO] Updated workload_metadata_config for node pool %s", name)
@@ -1468,7 +1472,7 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
14681472
timeout)
14691473
}
14701474

1471-
if err := tpgresource.RetryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
1475+
if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
14721476
return err
14731477
}
14741478

@@ -1501,7 +1505,7 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
15011505
timeout)
15021506
}
15031507

1504-
if err := tpgresource.RetryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
1508+
if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
15051509
return err
15061510
}
15071511

@@ -1532,7 +1536,7 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
15321536
nodePoolInfo.location, "updating GKE node pool size", userAgent,
15331537
timeout)
15341538
}
1535-
if err := tpgresource.RetryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
1539+
if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
15361540
return err
15371541
}
15381542
log.Printf("[INFO] GKE node pool %s size has been updated to %d", name, newSize)
@@ -1567,7 +1571,7 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
15671571
nodePoolInfo.location, "updating GKE node pool management", userAgent, timeout)
15681572
}
15691573

1570-
if err := tpgresource.RetryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
1574+
if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
15711575
return err
15721576
}
15731577
log.Printf("[INFO] Updated management in Node Pool %s", name)
@@ -1594,7 +1598,7 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
15941598
nodePoolInfo.project,
15951599
nodePoolInfo.location, "updating GKE node pool version", userAgent, timeout)
15961600
}
1597-
if err := tpgresource.RetryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
1601+
if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
15981602
return err
15991603
}
16001604
log.Printf("[INFO] Updated version in Node Pool %s", name)
@@ -1619,7 +1623,7 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
16191623
return ContainerOperationWait(config, op, nodePoolInfo.project, nodePoolInfo.location, "updating GKE node pool node locations", userAgent, timeout)
16201624
}
16211625

1622-
if err := tpgresource.RetryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
1626+
if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
16231627
return err
16241628
}
16251629
log.Printf("[INFO] Updated node locations in Node Pool %s", name)
@@ -1699,7 +1703,7 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
16991703
// Wait until it's updated
17001704
return ContainerOperationWait(config, op, nodePoolInfo.project, nodePoolInfo.location, "updating GKE node pool upgrade settings", userAgent, timeout)
17011705
}
1702-
if err := tpgresource.RetryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
1706+
if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
17031707
return err
17041708
}
17051709
log.Printf("[INFO] Updated upgrade settings in Node Pool %s", name)
@@ -1730,7 +1734,7 @@ func nodePoolUpdate(d *schema.ResourceData, meta interface{}, nodePoolInfo *Node
17301734
timeout)
17311735
}
17321736

1733-
if err := tpgresource.RetryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
1737+
if err := retryWhileIncompatibleOperation(timeout, npLockKey, updateF); err != nil {
17341738
return err
17351739
}
17361740

@@ -1781,3 +1785,21 @@ func containerNodePoolAwaitRestingState(config *transport_tpg.Config, name, proj
17811785

17821786
return state, err
17831787
}
1788+
1789+
// Retries an operation while the canonical error code is FAILED_PRECONDITION
1790+
// or RESOURCE_EXHAUSTED which indicates there is an incompatible operation
1791+
// already running on the cluster or that the maximum number of allowed
1792+
// concurrent operations is already running on the cluster. These errors can be safely
1793+
// retried until the incompatible operation completes, and the newly
1794+
// requested operation can begin.
1795+
func retryWhileIncompatibleOperation(timeout time.Duration, lockKey string, f func() error) error {
1796+
return resource.Retry(timeout, func() *resource.RetryError {
1797+
if err := transport_tpg.LockedCall(lockKey, f); err != nil {
1798+
if tpgresource.IsFailedPreconditionError(err) || tpgresource.IsQuotaError(err) {
1799+
return resource.RetryableError(err)
1800+
}
1801+
return resource.NonRetryableError(err)
1802+
}
1803+
return nil
1804+
})
1805+
}

google/tpgresource/utils.go

+14-17
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ import (
2222
"github.com/hashicorp/errwrap"
2323
fwDiags "github.com/hashicorp/terraform-plugin-framework/diag"
2424
"github.com/hashicorp/terraform-plugin-sdk/v2/diag"
25-
"github.com/hashicorp/terraform-plugin-sdk/v2/helper/resource"
2625
"github.com/hashicorp/terraform-plugin-sdk/v2/helper/schema"
2726
"github.com/hashicorp/terraform-plugin-sdk/v2/terraform"
2827
"google.golang.org/api/googleapi"
@@ -128,6 +127,20 @@ func IsFailedPreconditionError(err error) bool {
128127
return false
129128
}
130129

130+
func IsQuotaError(err error) bool {
131+
gerr, ok := errwrap.GetType(err, &googleapi.Error{}).(*googleapi.Error)
132+
if !ok {
133+
return false
134+
}
135+
if gerr == nil {
136+
return false
137+
}
138+
if gerr.Code != 429 {
139+
return false
140+
}
141+
return true
142+
}
143+
131144
func IsConflictError(err error) bool {
132145
if e, ok := err.(*googleapi.Error); ok && (e.Code == 409 || e.Code == 412) {
133146
return true
@@ -503,22 +516,6 @@ func CheckGoogleIamPolicy(value string) error {
503516
return nil
504517
}
505518

506-
// Retries an operation while the canonical error code is FAILED_PRECONDTION
507-
// which indicates there is an incompatible operation already running on the
508-
// cluster. This error can be safely retried until the incompatible operation
509-
// completes, and the newly requested operation can begin.
510-
func RetryWhileIncompatibleOperation(timeout time.Duration, lockKey string, f func() error) error {
511-
return resource.Retry(timeout, func() *resource.RetryError {
512-
if err := transport_tpg.LockedCall(lockKey, f); err != nil {
513-
if IsFailedPreconditionError(err) {
514-
return resource.RetryableError(err)
515-
}
516-
return resource.NonRetryableError(err)
517-
}
518-
return nil
519-
})
520-
}
521-
522519
func FrameworkDiagsToSdkDiags(fwD fwDiags.Diagnostics) *diag.Diagnostics {
523520
var diags diag.Diagnostics
524521
for _, e := range fwD.Errors() {

0 commit comments

Comments
 (0)