Skip to content

Commit d433400

Browse files
Fix an issue with resuming a failed container cluster creation (#7121) (#13580)
* Fix an issue with resuming a failed container cluster creation and add a test * Do not persist operation when resuming during read. Signed-off-by: Modular Magician <[email protected]> Signed-off-by: Modular Magician <[email protected]>
1 parent 022e24a commit d433400

File tree

4 files changed

+242
-3
lines changed

4 files changed

+242
-3
lines changed

.changelog/7121.txt

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
```release-note:bug
2+
container: fixed an issue with resuming failed cluster creation
3+
```

google/bootstrap_utils_test.go

+160-2
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,10 @@ import (
99
"testing"
1010
"time"
1111

12-
"google.golang.org/api/cloudkms/v1"
12+
"google.golang.org/api/cloudbilling/v1"
13+
cloudkms "google.golang.org/api/cloudkms/v1"
1314
cloudresourcemanager "google.golang.org/api/cloudresourcemanager/v1"
14-
"google.golang.org/api/iam/v1"
15+
iam "google.golang.org/api/iam/v1"
1516
sqladmin "google.golang.org/api/sqladmin/v1beta4"
1617
)
1718

@@ -358,6 +359,163 @@ func BootstrapServicePerimeterProjects(t *testing.T, desiredProjects int) []*clo
358359
return projects
359360
}
360361

362+
func removeContainerServiceAgentRoleFromContainerEngineRobot(t *testing.T, project *cloudresourcemanager.Project) {
363+
config := BootstrapConfig(t)
364+
if config == nil {
365+
return
366+
}
367+
368+
client := config.NewResourceManagerClient(config.userAgent)
369+
containerEngineRobot := fmt.Sprintf("serviceAccount:service-%[email protected]", project.ProjectNumber)
370+
getPolicyRequest := &cloudresourcemanager.GetIamPolicyRequest{}
371+
policy, err := client.Projects.GetIamPolicy(project.ProjectId, getPolicyRequest).Do()
372+
if err != nil {
373+
t.Fatalf("error getting project iam policy: %v", err)
374+
}
375+
roleFound := false
376+
changed := false
377+
for _, binding := range policy.Bindings {
378+
if binding.Role == "roles/container.serviceAgent" {
379+
memberFound := false
380+
for i, member := range binding.Members {
381+
if member == containerEngineRobot {
382+
binding.Members[i] = binding.Members[len(binding.Members)-1]
383+
memberFound = true
384+
}
385+
}
386+
if memberFound {
387+
binding.Members = binding.Members[:len(binding.Members)-1]
388+
changed = true
389+
}
390+
} else if binding.Role == "roles/editor" {
391+
memberFound := false
392+
for _, member := range binding.Members {
393+
if member == containerEngineRobot {
394+
memberFound = true
395+
break
396+
}
397+
}
398+
if !memberFound {
399+
binding.Members = append(binding.Members, containerEngineRobot)
400+
changed = true
401+
}
402+
roleFound = true
403+
}
404+
}
405+
if !roleFound {
406+
policy.Bindings = append(policy.Bindings, &cloudresourcemanager.Binding{
407+
Members: []string{containerEngineRobot},
408+
Role: "roles/editor",
409+
})
410+
changed = true
411+
}
412+
if changed {
413+
setPolicyRequest := &cloudresourcemanager.SetIamPolicyRequest{Policy: policy}
414+
policy, err = client.Projects.SetIamPolicy(project.ProjectId, setPolicyRequest).Do()
415+
if err != nil {
416+
t.Fatalf("error setting project iam policy: %v", err)
417+
}
418+
}
419+
}
420+
421+
func BootstrapProject(t *testing.T, projectID, billingAccount string, services []string) *cloudresourcemanager.Project {
422+
config := BootstrapConfig(t)
423+
if config == nil {
424+
return nil
425+
}
426+
427+
crmClient := config.NewResourceManagerClient(config.userAgent)
428+
429+
project, err := crmClient.Projects.Get(projectID).Do()
430+
if err != nil {
431+
if !isGoogleApiErrorWithCode(err, 403) {
432+
t.Fatalf("Error getting bootstrapped project: %s", err)
433+
}
434+
org := getTestOrgFromEnv(t)
435+
436+
op, err := crmClient.Projects.Create(&cloudresourcemanager.Project{
437+
ProjectId: projectID,
438+
Name: "Bootstrapped Test Project",
439+
Parent: &cloudresourcemanager.ResourceId{
440+
Type: "organization",
441+
Id: org,
442+
},
443+
}).Do()
444+
if err != nil {
445+
t.Fatalf("Error creating bootstrapped test project: %s", err)
446+
}
447+
448+
opAsMap, err := ConvertToMap(op)
449+
if err != nil {
450+
t.Fatalf("Error converting create project operation to map: %s", err)
451+
}
452+
453+
err = resourceManagerOperationWaitTime(config, opAsMap, "creating project", config.userAgent, 4*time.Minute)
454+
if err != nil {
455+
t.Fatalf("Error waiting for create project operation: %s", err)
456+
}
457+
458+
project, err = crmClient.Projects.Get(projectID).Do()
459+
if err != nil {
460+
t.Fatalf("Error getting bootstrapped project: %s", err)
461+
}
462+
463+
}
464+
465+
if project.LifecycleState == "DELETE_REQUESTED" {
466+
_, err := crmClient.Projects.Undelete(projectID, &cloudresourcemanager.UndeleteProjectRequest{}).Do()
467+
if err != nil {
468+
t.Fatalf("Error undeleting bootstrapped project: %s", err)
469+
}
470+
}
471+
472+
if billingAccount != "" {
473+
billingClient := config.NewBillingClient(config.userAgent)
474+
var pbi *cloudbilling.ProjectBillingInfo
475+
err = retryTimeDuration(func() error {
476+
var reqErr error
477+
pbi, reqErr = billingClient.Projects.GetBillingInfo(prefixedProject(projectID)).Do()
478+
return reqErr
479+
}, 30*time.Second)
480+
if err != nil {
481+
t.Fatalf("Error getting billing info for project %q: %v", projectID, err)
482+
}
483+
if strings.TrimPrefix(pbi.BillingAccountName, "billingAccounts/") != billingAccount {
484+
pbi.BillingAccountName = "billingAccounts/" + billingAccount
485+
err := retryTimeDuration(func() error {
486+
_, err := config.NewBillingClient(config.userAgent).Projects.UpdateBillingInfo(prefixedProject(projectID), pbi).Do()
487+
return err
488+
}, 2*time.Minute)
489+
if err != nil {
490+
t.Fatalf("Error setting billing account for project %q to %q: %s", projectID, billingAccount, err)
491+
}
492+
}
493+
}
494+
495+
if len(services) > 0 {
496+
497+
enabledServices, err := listCurrentlyEnabledServices(projectID, "", config.userAgent, config, 1*time.Minute)
498+
if err != nil {
499+
t.Fatalf("Error listing services for project %q: %s", projectID, err)
500+
}
501+
502+
servicesToEnable := make([]string, 0, len(services))
503+
for _, service := range services {
504+
if _, ok := enabledServices[service]; !ok {
505+
servicesToEnable = append(servicesToEnable, service)
506+
}
507+
}
508+
509+
if len(servicesToEnable) > 0 {
510+
if err := enableServiceUsageProjectServices(servicesToEnable, projectID, "", config.userAgent, config, 10*time.Minute); err != nil {
511+
t.Fatalf("Error enabling services for project %q: %s", projectID, err)
512+
}
513+
}
514+
}
515+
516+
return project
517+
}
518+
361519
func BootstrapConfig(t *testing.T) *Config {
362520
if v := os.Getenv("TF_ACC"); v == "" {
363521
t.Skip("Acceptance tests and bootstrapping skipped unless env 'TF_ACC' set")

google/resource_container_cluster.go

+21-1
Original file line numberDiff line numberDiff line change
@@ -2003,6 +2003,8 @@ func resourceContainerClusterRead(d *schema.ResourceData, meta interface{}) erro
20032003
return err
20042004
}
20052005

2006+
clusterName := d.Get("name").(string)
2007+
20062008
operation := d.Get("operation").(string)
20072009
if operation != "" {
20082010
log.Printf("[DEBUG] in progress operation detected at %v, attempting to resume", operation)
@@ -2014,11 +2016,29 @@ func resourceContainerClusterRead(d *schema.ResourceData, meta interface{}) erro
20142016
}
20152017
waitErr := containerOperationWait(config, op, project, location, "resuming GKE cluster", userAgent, d.Timeout(schema.TimeoutRead))
20162018
if waitErr != nil {
2019+
// Try a GET on the cluster so we can see the state in debug logs. This will help classify error states.
2020+
clusterGetCall := config.NewContainerClient(userAgent).Projects.Locations.Clusters.Get(containerClusterFullName(project, location, clusterName))
2021+
if config.UserProjectOverride {
2022+
clusterGetCall.Header().Add("X-Goog-User-Project", project)
2023+
}
2024+
_, getErr := clusterGetCall.Do()
2025+
if getErr != nil {
2026+
log.Printf("[WARN] Cluster %s was created in an error state and not found", clusterName)
2027+
d.SetId("")
2028+
}
2029+
2030+
if deleteErr := cleanFailedContainerCluster(d, meta); deleteErr != nil {
2031+
log.Printf("[WARN] Unable to clean up cluster from failed creation: %s", deleteErr)
2032+
// Leave ID set as the cluster likely still exists and should not be removed from state yet.
2033+
} else {
2034+
log.Printf("[WARN] Verified failed creation of cluster %s was cleaned up", d.Id())
2035+
d.SetId("")
2036+
}
2037+
// The resource didn't actually create
20172038
return waitErr
20182039
}
20192040
}
20202041

2021-
clusterName := d.Get("name").(string)
20222042
name := containerClusterFullName(project, location, clusterName)
20232043
clusterGetCall := config.NewContainerClient(userAgent).Projects.Locations.Clusters.Get(name)
20242044
if config.UserProjectOverride {

google/resource_container_cluster_test.go

+58
Original file line numberDiff line numberDiff line change
@@ -3795,6 +3795,32 @@ func TestAccContainerCluster_withEnablePrivateEndpointToggle(t *testing.T) {
37953795
})
37963796
}
37973797

3798+
func TestAccContainerCluster_failedCreation(t *testing.T) {
3799+
// Test that in a scenario where the cluster fails to create, a subsequent apply will delete the resource.
3800+
t.Parallel()
3801+
3802+
clusterName := fmt.Sprintf("tf-test-cluster-%s", randString(t, 10))
3803+
3804+
project := BootstrapProject(t, "tf-fail-cluster-test", getTestBillingAccountFromEnv(t), []string{"container.googleapis.com"})
3805+
removeContainerServiceAgentRoleFromContainerEngineRobot(t, project)
3806+
3807+
vcrTest(t, resource.TestCase{
3808+
PreCheck: func() { testAccPreCheck(t) },
3809+
Providers: testAccProviders,
3810+
Steps: []resource.TestStep{
3811+
{
3812+
Config: testAccContainerCluster_failedCreation(clusterName, project.ProjectId),
3813+
ExpectError: regexp.MustCompile("timeout while waiting for state to become 'DONE'"),
3814+
},
3815+
{
3816+
Config: testAccContainerCluster_failedCreation_update(clusterName, project.ProjectId),
3817+
ExpectError: regexp.MustCompile("Failed to create cluster"),
3818+
Check: testAccCheckContainerClusterDestroyProducer(t),
3819+
},
3820+
},
3821+
})
3822+
}
3823+
37983824
func testAccContainerCluster_withEnablePrivateEndpoint(clusterName string, flag string) string {
37993825

38003826
return fmt.Sprintf(`
@@ -6133,3 +6159,35 @@ resource "google_container_cluster" "primary" {
61336159
}
61346160
`, name, name, name)
61356161
}
6162+
6163+
func testAccContainerCluster_failedCreation(cluster, project string) string {
6164+
return fmt.Sprintf(`
6165+
resource "google_container_cluster" "primary" {
6166+
name = "%s"
6167+
project = "%s"
6168+
location = "us-central1-a"
6169+
initial_node_count = 1
6170+
6171+
workload_identity_config {
6172+
workload_pool = "%s.svc.id.goog"
6173+
}
6174+
6175+
timeouts {
6176+
create = "40s"
6177+
}
6178+
}`, cluster, project, project)
6179+
}
6180+
6181+
func testAccContainerCluster_failedCreation_update(cluster, project string) string {
6182+
return fmt.Sprintf(`
6183+
resource "google_container_cluster" "primary" {
6184+
name = "%s"
6185+
project = "%s"
6186+
location = "us-central1-a"
6187+
initial_node_count = 1
6188+
6189+
workload_identity_config {
6190+
workload_pool = "%s.svc.id.goog"
6191+
}
6192+
}`, cluster, project, project)
6193+
}

0 commit comments

Comments
 (0)