feat: Add support for gpu_partition_size (#1072)

stanley98yu · web-flow · commit cff1c1be2658 · 2021-11-22T20:49:58.000-05:00
* Add support for gpu_partition_size

* Add test for gpu_partition_size

* Lint formatting fix
diff --git a/README.md b/README.md
@@ -236,6 +236,7 @@ The node_pools variable takes the following parameters:
 | effect | Effect for the taint | | Required |
 | enable_integrity_monitoring | Enables monitoring and attestation of the boot integrity of the instance. The attestation is performed against the integrity policy baseline. This baseline is initially derived from the implicitly trusted boot image when the instance is created. | true | Optional |
 | enable_secure_boot | Secure Boot helps ensure that the system only runs authentic software by verifying the digital signature of all boot components, and halting the boot process if signature verification fails. | false | Optional |
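+| gpu_partition_size | Size of partitions to create on the GPU, for example `1g.5gb`. Valid values are described in the NVIDIA MIG user guide. Only applied when accelerator_count is greater than 0. | null | Optional |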
 | image_type | The image type to use for this node. Note that changing the image type will delete and recreate all nodes in the node pool | COS | Optional |
 | initial_node_count | The initial number of nodes for the pool. In regional or multi-zonal clusters, this is the number of nodes per zone. Changing this will force recreation of the resource. Defaults to the value of min_count | " " | Optional |
 | key | The key required for the taint | | Required |
diff --git a/autogen/main/README.md b/autogen/main/README.md
@@ -186,6 +186,7 @@ The node_pools variable takes the following parameters:
 | effect | Effect for the taint | | Required |
 | enable_integrity_monitoring | Enables monitoring and attestation of the boot integrity of the instance. The attestation is performed against the integrity policy baseline. This baseline is initially derived from the implicitly trusted boot image when the instance is created. | true | Optional |
 | enable_secure_boot | Secure Boot helps ensure that the system only runs authentic software by verifying the digital signature of all boot components, and halting the boot process if signature verification fails. | false | Optional |
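+| gpu_partition_size | Size of partitions to create on the GPU, for example `1g.5gb`. Valid values are described in the NVIDIA MIG user guide. Only applied when accelerator_count is greater than 0. | null | Optional |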
 | image_type | The image type to use for this node. Note that changing the image type will delete and recreate all nodes in the node pool | COS | Optional |
 | initial_node_count | The initial number of nodes for the pool. In regional or multi-zonal clusters, this is the number of nodes per zone. Changing this will force recreation of the resource. Defaults to the value of min_count | " " | Optional |
 | key | The key required for the taint | | Required |
diff --git a/autogen/main/cluster.tf.tmpl b/autogen/main/cluster.tf.tmpl
@@ -598,9 +598,11 @@ resource "google_container_node_pool" "pools" {
       for guest_accelerator in lookup(each.value, "accelerator_count", 0) > 0 ? [{
         type  = lookup(each.value, "accelerator_type", "")
         count = lookup(each.value, "accelerator_count", 0)
+        gpu_partition_size = lookup(each.value, "gpu_partition_size", null)
         }] : [] : {
         type  = guest_accelerator["type"]
         count = guest_accelerator["count"]
+        gpu_partition_size = guest_accelerator["gpu_partition_size"]
       }
     ]
 
diff --git a/cluster.tf b/cluster.tf
@@ -308,11 +308,13 @@ resource "google_container_node_pool" "pools" {
 
     guest_accelerator = [
       for guest_accelerator in lookup(each.value, "accelerator_count", 0) > 0 ? [{
-        type  = lookup(each.value, "accelerator_type", "")
-        count = lookup(each.value, "accelerator_count", 0)
+        type               = lookup(each.value, "accelerator_type", "")
+        count              = lookup(each.value, "accelerator_count", 0)
+        gpu_partition_size = lookup(each.value, "gpu_partition_size", null)
         }] : [] : {
-        type  = guest_accelerator["type"]
-        count = guest_accelerator["count"]
+        type               = guest_accelerator["type"]
+        count              = guest_accelerator["count"]
+        gpu_partition_size = guest_accelerator["gpu_partition_size"]
       }
     ]
 
diff --git a/examples/node_pool/main.tf b/examples/node_pool/main.tf
@@ -19,7 +19,7 @@ locals {
 }
 
 provider "google-beta" {
-  version = "~> 3.79.0"
+  version = "~> 3.90.0"
   region  = var.region
 }
 
@@ -55,18 +55,19 @@ module "gke" {
       auto_upgrade    = true
     },
     {
-      name              = "pool-02"
-      machine_type      = "n1-standard-2"
-      min_count         = 1
-      max_count         = 2
-      local_ssd_count   = 0
-      disk_size_gb      = 30
-      disk_type         = "pd-standard"
-      accelerator_count = 1
-      accelerator_type  = "nvidia-tesla-p4"
-      image_type        = "COS"
-      auto_repair       = false
-      service_account   = var.compute_engine_service_account
+      name               = "pool-02"
+      machine_type       = "a2-highgpu-1g"
+      min_count          = 1
+      max_count          = 2
+      local_ssd_count    = 0
+      disk_size_gb       = 30
+      disk_type          = "pd-standard"
+      accelerator_count  = 1
+      accelerator_type   = "nvidia-tesla-a100"
+      gpu_partition_size = "1g.5gb"
+      image_type         = "COS"
+      auto_repair        = false
+      service_account    = var.compute_engine_service_account
     },
     {
       name               = "pool-03"
diff --git a/modules/beta-private-cluster-update-variant/README.md b/modules/beta-private-cluster-update-variant/README.md
@@ -308,6 +308,7 @@ The node_pools variable takes the following parameters:
 | effect | Effect for the taint | | Required |
 | enable_integrity_monitoring | Enables monitoring and attestation of the boot integrity of the instance. The attestation is performed against the integrity policy baseline. This baseline is initially derived from the implicitly trusted boot image when the instance is created. | true | Optional |
 | enable_secure_boot | Secure Boot helps ensure that the system only runs authentic software by verifying the digital signature of all boot components, and halting the boot process if signature verification fails. | false | Optional |
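+| gpu_partition_size | Size of partitions to create on the GPU, for example `1g.5gb`. Valid values are described in the NVIDIA MIG user guide. Only applied when accelerator_count is greater than 0. | null | Optional |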
 | image_type | The image type to use for this node. Note that changing the image type will delete and recreate all nodes in the node pool | COS | Optional |
 | initial_node_count | The initial number of nodes for the pool. In regional or multi-zonal clusters, this is the number of nodes per zone. Changing this will force recreation of the resource. Defaults to the value of min_count | " " | Optional |
 | key | The key required for the taint | | Required |
diff --git a/modules/beta-private-cluster-update-variant/cluster.tf b/modules/beta-private-cluster-update-variant/cluster.tf
@@ -535,11 +535,13 @@ resource "google_container_node_pool" "pools" {
 
     guest_accelerator = [
       for guest_accelerator in lookup(each.value, "accelerator_count", 0) > 0 ? [{
-        type  = lookup(each.value, "accelerator_type", "")
-        count = lookup(each.value, "accelerator_count", 0)
+        type               = lookup(each.value, "accelerator_type", "")
+        count              = lookup(each.value, "accelerator_count", 0)
+        gpu_partition_size = lookup(each.value, "gpu_partition_size", null)
         }] : [] : {
-        type  = guest_accelerator["type"]
-        count = guest_accelerator["count"]
+        type               = guest_accelerator["type"]
+        count              = guest_accelerator["count"]
+        gpu_partition_size = guest_accelerator["gpu_partition_size"]
       }
     ]
 
diff --git a/modules/beta-private-cluster/README.md b/modules/beta-private-cluster/README.md
@@ -286,6 +286,7 @@ The node_pools variable takes the following parameters:
 | effect | Effect for the taint | | Required |
 | enable_integrity_monitoring | Enables monitoring and attestation of the boot integrity of the instance. The attestation is performed against the integrity policy baseline. This baseline is initially derived from the implicitly trusted boot image when the instance is created. | true | Optional |
 | enable_secure_boot | Secure Boot helps ensure that the system only runs authentic software by verifying the digital signature of all boot components, and halting the boot process if signature verification fails. | false | Optional |
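+| gpu_partition_size | Size of partitions to create on the GPU, for example `1g.5gb`. Valid values are described in the NVIDIA MIG user guide. Only applied when accelerator_count is greater than 0. | null | Optional |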
 | image_type | The image type to use for this node. Note that changing the image type will delete and recreate all nodes in the node pool | COS | Optional |
 | initial_node_count | The initial number of nodes for the pool. In regional or multi-zonal clusters, this is the number of nodes per zone. Changing this will force recreation of the resource. Defaults to the value of min_count | " " | Optional |
 | key | The key required for the taint | | Required |
diff --git a/modules/beta-private-cluster/cluster.tf b/modules/beta-private-cluster/cluster.tf
@@ -450,11 +450,13 @@ resource "google_container_node_pool" "pools" {
 
     guest_accelerator = [
       for guest_accelerator in lookup(each.value, "accelerator_count", 0) > 0 ? [{
-        type  = lookup(each.value, "accelerator_type", "")
-        count = lookup(each.value, "accelerator_count", 0)
+        type               = lookup(each.value, "accelerator_type", "")
+        count              = lookup(each.value, "accelerator_count", 0)
+        gpu_partition_size = lookup(each.value, "gpu_partition_size", null)
         }] : [] : {
-        type  = guest_accelerator["type"]
-        count = guest_accelerator["count"]
+        type               = guest_accelerator["type"]
+        count              = guest_accelerator["count"]
+        gpu_partition_size = guest_accelerator["gpu_partition_size"]
       }
     ]
 
diff --git a/modules/beta-public-cluster-update-variant/README.md b/modules/beta-public-cluster-update-variant/README.md
@@ -295,6 +295,7 @@ The node_pools variable takes the following parameters:
 | effect | Effect for the taint | | Required |
 | enable_integrity_monitoring | Enables monitoring and attestation of the boot integrity of the instance. The attestation is performed against the integrity policy baseline. This baseline is initially derived from the implicitly trusted boot image when the instance is created. | true | Optional |
 | enable_secure_boot | Secure Boot helps ensure that the system only runs authentic software by verifying the digital signature of all boot components, and halting the boot process if signature verification fails. | false | Optional |
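+| gpu_partition_size | Size of partitions to create on the GPU, for example `1g.5gb`. Valid values are described in the NVIDIA MIG user guide. Only applied when accelerator_count is greater than 0. | null | Optional |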
 | image_type | The image type to use for this node. Note that changing the image type will delete and recreate all nodes in the node pool | COS | Optional |
 | initial_node_count | The initial number of nodes for the pool. In regional or multi-zonal clusters, this is the number of nodes per zone. Changing this will force recreation of the resource. Defaults to the value of min_count | " " | Optional |
 | key | The key required for the taint | | Required |
diff --git a/modules/beta-public-cluster-update-variant/cluster.tf b/modules/beta-public-cluster-update-variant/cluster.tf
@@ -516,11 +516,13 @@ resource "google_container_node_pool" "pools" {
 
     guest_accelerator = [
       for guest_accelerator in lookup(each.value, "accelerator_count", 0) > 0 ? [{
-        type  = lookup(each.value, "accelerator_type", "")
-        count = lookup(each.value, "accelerator_count", 0)
+        type               = lookup(each.value, "accelerator_type", "")
+        count              = lookup(each.value, "accelerator_count", 0)
+        gpu_partition_size = lookup(each.value, "gpu_partition_size", null)
         }] : [] : {
-        type  = guest_accelerator["type"]
-        count = guest_accelerator["count"]
+        type               = guest_accelerator["type"]
+        count              = guest_accelerator["count"]
+        gpu_partition_size = guest_accelerator["gpu_partition_size"]
       }
     ]
 
diff --git a/modules/beta-public-cluster/README.md b/modules/beta-public-cluster/README.md
@@ -273,6 +273,7 @@ The node_pools variable takes the following parameters:
 | effect | Effect for the taint | | Required |
 | enable_integrity_monitoring | Enables monitoring and attestation of the boot integrity of the instance. The attestation is performed against the integrity policy baseline. This baseline is initially derived from the implicitly trusted boot image when the instance is created. | true | Optional |
 | enable_secure_boot | Secure Boot helps ensure that the system only runs authentic software by verifying the digital signature of all boot components, and halting the boot process if signature verification fails. | false | Optional |
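+| gpu_partition_size | Size of partitions to create on the GPU, for example `1g.5gb`. Valid values are described in the NVIDIA MIG user guide. Only applied when accelerator_count is greater than 0. | null | Optional |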
 | image_type | The image type to use for this node. Note that changing the image type will delete and recreate all nodes in the node pool | COS | Optional |
 | initial_node_count | The initial number of nodes for the pool. In regional or multi-zonal clusters, this is the number of nodes per zone. Changing this will force recreation of the resource. Defaults to the value of min_count | " " | Optional |
 | key | The key required for the taint | | Required |
diff --git a/modules/beta-public-cluster/cluster.tf b/modules/beta-public-cluster/cluster.tf
@@ -431,11 +431,13 @@ resource "google_container_node_pool" "pools" {
 
     guest_accelerator = [
       for guest_accelerator in lookup(each.value, "accelerator_count", 0) > 0 ? [{
-        type  = lookup(each.value, "accelerator_type", "")
-        count = lookup(each.value, "accelerator_count", 0)
+        type               = lookup(each.value, "accelerator_type", "")
+        count              = lookup(each.value, "accelerator_count", 0)
+        gpu_partition_size = lookup(each.value, "gpu_partition_size", null)
         }] : [] : {
-        type  = guest_accelerator["type"]
-        count = guest_accelerator["count"]
+        type               = guest_accelerator["type"]
+        count              = guest_accelerator["count"]
+        gpu_partition_size = guest_accelerator["gpu_partition_size"]
       }
     ]
 
diff --git a/modules/private-cluster-update-variant/README.md b/modules/private-cluster-update-variant/README.md
@@ -270,6 +270,7 @@ The node_pools variable takes the following parameters:
 | effect | Effect for the taint | | Required |
 | enable_integrity_monitoring | Enables monitoring and attestation of the boot integrity of the instance. The attestation is performed against the integrity policy baseline. This baseline is initially derived from the implicitly trusted boot image when the instance is created. | true | Optional |
 | enable_secure_boot | Secure Boot helps ensure that the system only runs authentic software by verifying the digital signature of all boot components, and halting the boot process if signature verification fails. | false | Optional |
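+| gpu_partition_size | Size of partitions to create on the GPU, for example `1g.5gb`. Valid values are described in the NVIDIA MIG user guide. Only applied when accelerator_count is greater than 0. | null | Optional |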
 | image_type | The image type to use for this node. Note that changing the image type will delete and recreate all nodes in the node pool | COS | Optional |
 | initial_node_count | The initial number of nodes for the pool. In regional or multi-zonal clusters, this is the number of nodes per zone. Changing this will force recreation of the resource. Defaults to the value of min_count | " " | Optional |
 | key | The key required for the taint | | Required |
diff --git a/modules/private-cluster-update-variant/cluster.tf b/modules/private-cluster-update-variant/cluster.tf
@@ -406,11 +406,13 @@ resource "google_container_node_pool" "pools" {
 
     guest_accelerator = [
       for guest_accelerator in lookup(each.value, "accelerator_count", 0) > 0 ? [{
-        type  = lookup(each.value, "accelerator_type", "")
-        count = lookup(each.value, "accelerator_count", 0)
+        type               = lookup(each.value, "accelerator_type", "")
+        count              = lookup(each.value, "accelerator_count", 0)
+        gpu_partition_size = lookup(each.value, "gpu_partition_size", null)
         }] : [] : {
-        type  = guest_accelerator["type"]
-        count = guest_accelerator["count"]
+        type               = guest_accelerator["type"]
+        count              = guest_accelerator["count"]
+        gpu_partition_size = guest_accelerator["gpu_partition_size"]
       }
     ]
 
diff --git a/modules/private-cluster/README.md b/modules/private-cluster/README.md
@@ -248,6 +248,7 @@ The node_pools variable takes the following parameters:
 | effect | Effect for the taint | | Required |
 | enable_integrity_monitoring | Enables monitoring and attestation of the boot integrity of the instance. The attestation is performed against the integrity policy baseline. This baseline is initially derived from the implicitly trusted boot image when the instance is created. | true | Optional |
 | enable_secure_boot | Secure Boot helps ensure that the system only runs authentic software by verifying the digital signature of all boot components, and halting the boot process if signature verification fails. | false | Optional |
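+| gpu_partition_size | Size of partitions to create on the GPU, for example `1g.5gb`. Valid values are described in the NVIDIA MIG user guide. Only applied when accelerator_count is greater than 0. | null | Optional |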
 | image_type | The image type to use for this node. Note that changing the image type will delete and recreate all nodes in the node pool | COS | Optional |
 | initial_node_count | The initial number of nodes for the pool. In regional or multi-zonal clusters, this is the number of nodes per zone. Changing this will force recreation of the resource. Defaults to the value of min_count | " " | Optional |
 | key | The key required for the taint | | Required |
diff --git a/modules/private-cluster/cluster.tf b/modules/private-cluster/cluster.tf
@@ -321,11 +321,13 @@ resource "google_container_node_pool" "pools" {
 
     guest_accelerator = [
       for guest_accelerator in lookup(each.value, "accelerator_count", 0) > 0 ? [{
-        type  = lookup(each.value, "accelerator_type", "")
-        count = lookup(each.value, "accelerator_count", 0)
+        type               = lookup(each.value, "accelerator_type", "")
+        count              = lookup(each.value, "accelerator_count", 0)
+        gpu_partition_size = lookup(each.value, "gpu_partition_size", null)
         }] : [] : {
-        type  = guest_accelerator["type"]
-        count = guest_accelerator["count"]
+        type               = guest_accelerator["type"]
+        count              = guest_accelerator["count"]
+        gpu_partition_size = guest_accelerator["gpu_partition_size"]
       }
     ]
 
diff --git a/test/integration/node_pool/controls/gcloud.rb b/test/integration/node_pool/controls/gcloud.rb
@@ -17,7 +17,7 @@
 cluster_name = attribute('cluster_name')
 
 expected_accelerators_count = "1"
-expected_accelerators_type = "nvidia-tesla-p4"
+expected_accelerators_type = "nvidia-tesla-a100"
 
 control "gcloud" do
   title "Google Compute Engine GKE configuration"
@@ -207,7 +207,7 @@
             including(
               "name" => "pool-02",
               "config" => including(
-                "machineType" => "n1-standard-2",
+                "machineType" => "a2-highgpu-1g",
               ),
             )
           )
@@ -252,7 +252,8 @@
               "name" => "pool-02",
               "config" => including(
                 "accelerators" => [{"acceleratorCount" => expected_accelerators_count,
-                                    "acceleratorType" => expected_accelerators_type}],
+                                    "acceleratorType" => expected_accelerators_type,
+                                    "gpuPartitionSize" => "1g.5gb"}],
               ),
             )
           )

Original file line number	Diff line number	Diff line change
`@@ -598,9 +598,11 @@ resource "google_container_node_pool" "pools" {`
`598`	`598`	`for guest_accelerator in lookup(each.value, "accelerator_count", 0) > 0 ? [{`
`599`	`599`	`type = lookup(each.value, "accelerator_type", "")`
`600`	`600`	`count = lookup(each.value, "accelerator_count", 0)`
	`601`	`+ gpu_partition_size = lookup(each.value, "gpu_partition_size", null)`
`601`	`602`	`}] : [] : {`
`602`	`603`	`type = guest_accelerator["type"]`
`603`	`604`	`count = guest_accelerator["count"]`
	`605`	`+ gpu_partition_size = guest_accelerator["gpu_partition_size"]`
`604`	`606`	`}`
`605`	`607`	`]`
`606`	`608`
Original file line number	Diff line number	Diff line change
`@@ -17,7 +17,7 @@`
`17`	`17`	`cluster_name = attribute('cluster_name')`
`18`	`18`
`19`	`19`	`expected_accelerators_count = "1"`
`20`		`-expected_accelerators_type = "nvidia-tesla-p4"`
	`20`	`+expected_accelerators_type = "nvidia-tesla-a100"`
`21`	`21`
`22`	`22`	`control "gcloud" do`
`23`	`23`	`title "Google Compute Engine GKE configuration"`
`@@ -207,7 +207,7 @@`
`207`	`207`	`including(`
`208`	`208`	`"name" => "pool-02",`
`209`	`209`	`"config" => including(`
`210`		`- "machineType" => "n1-standard-2",`
	`210`	`+ "machineType" => "a2-highgpu-1g",`
`211`	`211`	`),`
`212`	`212`	`)`
`213`	`213`	`)`
`@@ -252,7 +252,8 @@`
`252`	`252`	`"name" => "pool-02",`
`253`	`253`	`"config" => including(`
`254`	`254`	`"accelerators" => [{"acceleratorCount" => expected_accelerators_count,`
`255`		`- "acceleratorType" => expected_accelerators_type}],`
	`255`	`+ "acceleratorType" => expected_accelerators_type,`
	`256`	`+ "gpuPartitionSize" => "1g.5gb"}],`
`256`	`257`	`),`
`257`	`258`	`)`
`258`	`259`	`)`