Commit 81164c3
Add skip_wait_on_job_termination option for dataflow job resources (#5844) (#11452)
Signed-off-by: Modular Magician <[email protected]>
Parent: 8a4b59e

5 files changed: +169 −8 lines

.changelog/5844.txt (+3)

```diff
@@ -0,0 +1,3 @@
+```release-note:enhancement
+dataflow: added `skip_wait_on_job_termination` attribute to `google_dataflow_job` and `google_dataflow_flex_template_job` resources (issue #10559)
+```
```

google/resource_dataflow_job.go (+29 −6)

```diff
@@ -18,6 +18,11 @@ import (
 
 const resourceDataflowJobGoogleProvidedLabelPrefix = "labels.goog-dataflow-provided"
 
+var dataflowTerminatingStatesMap = map[string]struct{}{
+	"JOB_STATE_CANCELLING": {},
+	"JOB_STATE_DRAINING":   {},
+}
+
 var dataflowTerminalStatesMap = map[string]struct{}{
 	"JOB_STATE_DONE":   {},
 	"JOB_STATE_FAILED": {},
@@ -204,6 +209,13 @@ func resourceDataflowJob() *schema.Resource {
 				Optional:    true,
 				Description: `Indicates if the job should use the streaming engine feature.`,
 			},
+
+			"skip_wait_on_job_termination": {
+				Type:        schema.TypeBool,
+				Optional:    true,
+				Default:     false,
+				Description: `If true, treat DRAINING and CANCELLING as terminal job states and do not wait for further changes before removing from terraform state and moving on. WARNING: this will lead to job name conflicts if you do not ensure that the job names are different, e.g. by embedding a release ID or by using a random_id.`,
+			},
 		},
 		UseJSONNumber: true,
 	}
@@ -233,6 +245,16 @@ func resourceDataflowJobTypeCustomizeDiff(_ context.Context, d *schema.ResourceD
 	return nil
 }
 
+// return true if a job is in a terminal state, OR if a job is in a
+// terminating state and skipWait is true
+func shouldStopDataflowJobDeleteQuery(state string, skipWait bool) bool {
+	_, stopQuery := dataflowTerminalStatesMap[state]
+	if !stopQuery && skipWait {
+		_, stopQuery = dataflowTerminatingStatesMap[state]
+	}
+	return stopQuery
+}
+
 func resourceDataflowJobCreate(d *schema.ResourceData, meta interface{}) error {
 	config := meta.(*Config)
 	userAgent, err := generateUserAgentString(d, config.userAgent)
@@ -343,7 +365,7 @@ func resourceDataflowJobRead(d *schema.ResourceData, meta interface{}) error {
 		return fmt.Errorf("Error setting additional_experiments: %s", err)
 	}
 
-	if _, ok := dataflowTerminalStatesMap[job.CurrentState]; ok {
+	if ok := shouldStopDataflowJobDeleteQuery(job.CurrentState, d.Get("skip_wait_on_job_termination").(bool)); ok {
 		log.Printf("[DEBUG] Removing resource '%s' because it is in state %s.\n", job.Name, job.CurrentState)
 		d.SetId("")
 		return nil
@@ -469,8 +491,9 @@ func resourceDataflowJobDelete(d *schema.ResourceData, meta interface{}) error {
 		return err
 	}
 
-	// Wait for state to reach terminal state (canceled/drained/done)
-	_, ok := dataflowTerminalStatesMap[d.Get("state").(string)]
+	// Wait for state to reach terminal state (canceled/drained/done plus cancelling/draining if skipWait)
+	skipWait := d.Get("skip_wait_on_job_termination").(bool)
+	ok := shouldStopDataflowJobDeleteQuery(d.Get("state").(string), skipWait)
 	for !ok {
 		log.Printf("[DEBUG] Waiting for job with job state %q to terminate...", d.Get("state").(string))
 		time.Sleep(5 * time.Second)
@@ -479,11 +502,11 @@ func resourceDataflowJobDelete(d *schema.ResourceData, meta interface{}) error {
 		if err != nil {
 			return fmt.Errorf("Error while reading job to see if it was properly terminated: %v", err)
 		}
-		_, ok = dataflowTerminalStatesMap[d.Get("state").(string)]
+		ok = shouldStopDataflowJobDeleteQuery(d.Get("state").(string), skipWait)
 	}
 
-	// Only remove the job from state if it's actually successfully canceled.
-	if _, ok := dataflowTerminalStatesMap[d.Get("state").(string)]; ok {
+	// Only remove the job from state if it's actually successfully hit a final state.
+	if ok = shouldStopDataflowJobDeleteQuery(d.Get("state").(string), skipWait); ok {
 		log.Printf("[DEBUG] Removing dataflow job with final state %q", d.Get("state").(string))
 		d.SetId("")
 		return nil
```

google/resource_dataflow_job_test.go (+73 −2)

```diff
@@ -2,6 +2,7 @@ package google
 
 import (
 	"fmt"
+	"strconv"
 	"strings"
 	"testing"
 	"time"
@@ -44,6 +45,32 @@ func TestAccDataflowJob_basic(t *testing.T) {
 	})
 }
 
+func TestAccDataflowJobSkipWait_basic(t *testing.T) {
+	// Dataflow responses include serialized java classes and bash commands
+	// This makes body comparison infeasible
+	skipIfVcr(t)
+	t.Parallel()
+
+	randStr := randString(t, 10)
+	bucket := "tf-test-dataflow-gcs-" + randStr
+	job := "tf-test-dataflow-job-" + randStr
+	zone := "us-central1-f"
+
+	vcrTest(t, resource.TestCase{
+		PreCheck:     func() { testAccPreCheck(t) },
+		Providers:    testAccProviders,
+		CheckDestroy: testAccCheckDataflowJobDestroyProducer(t),
+		Steps: []resource.TestStep{
+			{
+				Config: testAccDataflowJobSkipWait_zone(bucket, job, zone),
+				Check: resource.ComposeTestCheckFunc(
+					testAccDataflowJobExists(t, "google_dataflow_job.big_data"),
+				),
+			},
+		},
+	})
+}
+
 func TestAccDataflowJob_withRegion(t *testing.T) {
 	// Dataflow responses include serialized java classes and bash commands
 	// This makes body comparison infeasible
@@ -329,7 +356,16 @@ func testAccCheckDataflowJobDestroyProducer(t *testing.T) func(s *terraform.Stat
 			config := googleProviderConfig(t)
 			job, err := config.NewDataflowClient(config.userAgent).Projects.Jobs.Get(config.Project, rs.Primary.ID).Do()
 			if job != nil {
-				if _, ok := dataflowTerminalStatesMap[job.CurrentState]; !ok {
+				var ok bool
+				skipWait, err := strconv.ParseBool(rs.Primary.Attributes["skip_wait_on_job_termination"])
+				if err != nil {
+					return fmt.Errorf("could not parse attribute: %v", err)
+				}
+				_, ok = dataflowTerminalStatesMap[job.CurrentState]
+				if !ok && skipWait {
+					_, ok = dataflowTerminatingStatesMap[job.CurrentState]
+				}
+				if !ok {
 					return fmt.Errorf("Job still present")
 				}
 			} else if err != nil {
@@ -351,7 +387,16 @@ func testAccCheckDataflowJobRegionDestroyProducer(t *testing.T) func(s *terrafor
 			config := googleProviderConfig(t)
 			job, err := config.NewDataflowClient(config.userAgent).Projects.Locations.Jobs.Get(config.Project, "us-central1", rs.Primary.ID).Do()
 			if job != nil {
-				if _, ok := dataflowTerminalStatesMap[job.CurrentState]; !ok {
+				var ok bool
+				skipWait, err := strconv.ParseBool(rs.Primary.Attributes["skip_wait_on_job_termination"])
+				if err != nil {
+					return fmt.Errorf("could not parse attribute: %v", err)
+				}
+				_, ok = dataflowTerminalStatesMap[job.CurrentState]
+				if !ok && skipWait {
+					_, ok = dataflowTerminatingStatesMap[job.CurrentState]
+				}
+				if !ok {
 					return fmt.Errorf("Job still present")
 				}
 			} else if err != nil {
@@ -635,6 +680,32 @@ resource "google_dataflow_job" "big_data" {
 `, bucket, job, zone, testDataflowJobTemplateWordCountUrl, testDataflowJobSampleFileUrl)
 }
 
+func testAccDataflowJobSkipWait_zone(bucket, job, zone string) string {
+	return fmt.Sprintf(`
+resource "google_storage_bucket" "temp" {
+  name          = "%s"
+  location      = "US"
+  force_destroy = true
+}
+
+resource "google_dataflow_job" "big_data" {
+  name = "%s"
+
+  zone = "%s"
+
+  machine_type      = "e2-standard-2"
+  template_gcs_path = "%s"
+  temp_gcs_location = google_storage_bucket.temp.url
+  parameters = {
+    inputFile = "%s"
+    output    = "${google_storage_bucket.temp.url}/output"
+  }
+  on_delete                    = "cancel"
+  skip_wait_on_job_termination = true
+}
+`, bucket, job, zone, testDataflowJobTemplateWordCountUrl, testDataflowJobSampleFileUrl)
+}
+
 func testAccDataflowJob_region(bucket, job string) string {
 	return fmt.Sprintf(`
 resource "google_storage_bucket" "temp" {
```

website/docs/r/dataflow_flex_template_job.html.markdown (+36)

```diff
@@ -48,6 +48,38 @@ is "cancelled", but if a user sets `on_delete` to `"drain"` in the
 configuration, you may experience a long wait for your `terraform destroy` to
 complete.
 
+You can potentially short-circuit the wait by setting `skip_wait_on_job_termination`
+to `true`, but beware that unless you take active steps to ensure that the job
+`name` parameter changes between instances, the name will conflict and the launch
+of the new job will fail. One way to do this is with a
+[random_id](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/id)
+resource, for example:
+
+```hcl
+variable "big_data_job_subscription_id" {
+  type    = string
+  default = "projects/myproject/subscriptions/messages"
+}
+
+resource "random_id" "big_data_job_name_suffix" {
+  byte_length = 4
+  keepers = {
+    region          = var.region
+    subscription_id = var.big_data_job_subscription_id
+  }
+}
+resource "google_dataflow_flex_template_job" "big_data_job" {
+  provider                     = google-beta
+  name                         = "dataflow-flextemplates-job-${random_id.big_data_job_name_suffix.dec}"
+  region                       = var.region
+  container_spec_gcs_path      = "gs://my-bucket/templates/template.json"
+  skip_wait_on_job_termination = true
+  parameters = {
+    inputSubscription = var.big_data_job_subscription_id
+  }
+}
+```
+
 ## Argument Reference
 
 The following arguments are supported:
@@ -74,6 +106,10 @@ labels will be ignored to prevent diffs on re-apply.
 * `on_delete` - (Optional) One of "drain" or "cancel". Specifies behavior of
   deletion during `terraform destroy`. See above note.
 
+* `skip_wait_on_job_termination` - (Optional) If set to `true`, terraform will
+  treat `DRAINING` and `CANCELLING` as terminal states when deleting the resource,
+  and will remove the resource from terraform state and move on. See above note.
+
 * `project` - (Optional) The project in which the resource belongs. If it is not
   provided, the provider project is used.
```

website/docs/r/dataflow_job.html.markdown (+28)

```diff
@@ -65,6 +65,33 @@ The Dataflow resource is considered 'existing' while it is in a nonterminal stat
 
 A Dataflow job which is 'destroyed' may be "cancelled" or "drained". If "cancelled", the job terminates - any data written remains where it is, but no new data will be processed. If "drained", no new data will enter the pipeline, but any data currently in the pipeline will finish being processed. The default is "drain". When `on_delete` is set to `"drain"` in the configuration, you may experience a long wait for your `terraform destroy` to complete.
 
+You can potentially short-circuit the wait by setting `skip_wait_on_job_termination` to `true`, but beware that unless you take active steps to ensure that the job `name` parameter changes between instances, the name will conflict and the launch of the new job will fail. One way to do this is with a [random_id](https://registry.terraform.io/providers/hashicorp/random/latest/docs/resources/id) resource, for example:
+
+```hcl
+variable "big_data_job_subscription_id" {
+  type    = string
+  default = "projects/myproject/subscriptions/messages"
+}
+
+resource "random_id" "big_data_job_name_suffix" {
+  byte_length = 4
+  keepers = {
+    region          = var.region
+    subscription_id = var.big_data_job_subscription_id
+  }
+}
+resource "google_dataflow_job" "big_data_job" {
+  name                         = "dataflow-job-${random_id.big_data_job_name_suffix.dec}"
+  region                       = var.region
+  template_gcs_path            = "gs://my-bucket/templates/template_file"
+  temp_gcs_location            = "gs://my-bucket/tmp_dir"
+  skip_wait_on_job_termination = true
+  parameters = {
+    inputSubscription = var.big_data_job_subscription_id
+  }
+}
+```
+
 ## Argument Reference
 
 The following arguments are supported:
@@ -83,6 +110,7 @@ The following arguments are supported:
 * `transform_name_mapping` - (Optional) Only applicable when updating a pipeline. Map of transform name prefixes of the job to be replaced with the corresponding name prefixes of the new job. This field is not used outside of update.
 * `max_workers` - (Optional) The number of workers permitted to work on the job. More workers may improve processing speed at additional cost.
 * `on_delete` - (Optional) One of "drain" or "cancel". Specifies behavior of deletion during `terraform destroy`. See above note.
+* `skip_wait_on_job_termination` - (Optional) If set to `true`, terraform will treat `DRAINING` and `CANCELLING` as terminal states when deleting the resource, and will remove the resource from terraform state and move on. See above note.
 * `project` - (Optional) The project in which the resource belongs. If it is not provided, the provider project is used.
 * `zone` - (Optional) The zone in which the created job should run. If it is not provided, the provider zone is used.
 * `region` - (Optional) The region in which the created job should run.
```
