Skip to content

Commit cff1c1b

Browse files
authored
feat: Add support for gpu_partition_size (#1072)
* Add support for gpu_partition_size
* Add test for gpu_partition_size
* Lint formatting fix
1 parent 5b16a50 commit cff1c1b

File tree

18 files changed

+70
-44
lines changed

18 files changed

+70
-44
lines changed

README.md

+1
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,7 @@ The node_pools variable takes the following parameters:
236236
| effect | Effect for the taint | | Required |
237237
| enable_integrity_monitoring | Enables monitoring and attestation of the boot integrity of the instance. The attestation is performed against the integrity policy baseline. This baseline is initially derived from the implicitly trusted boot image when the instance is created. | true | Optional |
238238
| enable_secure_boot | Secure Boot helps ensure that the system only runs authentic software by verifying the digital signature of all boot components, and halting the boot process if signature verification fails. | false | Optional |
239+
| gpu_partition_size | Size of partitions to create on the GPU | null | Optional |
239240
| image_type | The image type to use for this node. Note that changing the image type will delete and recreate all nodes in the node pool | COS | Optional |
240241
| initial_node_count | The initial number of nodes for the pool. In regional or multi-zonal clusters, this is the number of nodes per zone. Changing this will force recreation of the resource. Defaults to the value of min_count | " " | Optional |
241242
| key | The key required for the taint | | Required |

autogen/main/README.md

+1
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,7 @@ The node_pools variable takes the following parameters:
186186
| effect | Effect for the taint | | Required |
187187
| enable_integrity_monitoring | Enables monitoring and attestation of the boot integrity of the instance. The attestation is performed against the integrity policy baseline. This baseline is initially derived from the implicitly trusted boot image when the instance is created. | true | Optional |
188188
| enable_secure_boot | Secure Boot helps ensure that the system only runs authentic software by verifying the digital signature of all boot components, and halting the boot process if signature verification fails. | false | Optional |
189+
| gpu_partition_size | Size of partitions to create on the GPU | null | Optional |
189190
| image_type | The image type to use for this node. Note that changing the image type will delete and recreate all nodes in the node pool | COS | Optional |
190191
| initial_node_count | The initial number of nodes for the pool. In regional or multi-zonal clusters, this is the number of nodes per zone. Changing this will force recreation of the resource. Defaults to the value of min_count | " " | Optional |
191192
| key | The key required for the taint | | Required |

autogen/main/cluster.tf.tmpl

+2
Original file line numberDiff line numberDiff line change
@@ -598,9 +598,11 @@ resource "google_container_node_pool" "pools" {
598598
for guest_accelerator in lookup(each.value, "accelerator_count", 0) > 0 ? [{
599599
type = lookup(each.value, "accelerator_type", "")
600600
count = lookup(each.value, "accelerator_count", 0)
601+
gpu_partition_size = lookup(each.value, "gpu_partition_size", null)
601602
}] : [] : {
602603
type = guest_accelerator["type"]
603604
count = guest_accelerator["count"]
605+
gpu_partition_size = guest_accelerator["gpu_partition_size"]
604606
}
605607
]
606608

cluster.tf

+6-4
Original file line numberDiff line numberDiff line change
@@ -308,11 +308,13 @@ resource "google_container_node_pool" "pools" {
308308

309309
guest_accelerator = [
310310
for guest_accelerator in lookup(each.value, "accelerator_count", 0) > 0 ? [{
311-
type = lookup(each.value, "accelerator_type", "")
312-
count = lookup(each.value, "accelerator_count", 0)
311+
type = lookup(each.value, "accelerator_type", "")
312+
count = lookup(each.value, "accelerator_count", 0)
313+
gpu_partition_size = lookup(each.value, "gpu_partition_size", null)
313314
}] : [] : {
314-
type = guest_accelerator["type"]
315-
count = guest_accelerator["count"]
315+
type = guest_accelerator["type"]
316+
count = guest_accelerator["count"]
317+
gpu_partition_size = guest_accelerator["gpu_partition_size"]
316318
}
317319
]
318320

examples/node_pool/main.tf

+14-13
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ locals {
1919
}
2020

2121
provider "google-beta" {
22-
version = "~> 3.79.0"
22+
version = "~> 3.90.0"
2323
region = var.region
2424
}
2525

@@ -55,18 +55,19 @@ module "gke" {
5555
auto_upgrade = true
5656
},
5757
{
58-
name = "pool-02"
59-
machine_type = "n1-standard-2"
60-
min_count = 1
61-
max_count = 2
62-
local_ssd_count = 0
63-
disk_size_gb = 30
64-
disk_type = "pd-standard"
65-
accelerator_count = 1
66-
accelerator_type = "nvidia-tesla-p4"
67-
image_type = "COS"
68-
auto_repair = false
69-
service_account = var.compute_engine_service_account
58+
name = "pool-02"
59+
machine_type = "a2-highgpu-1g"
60+
min_count = 1
61+
max_count = 2
62+
local_ssd_count = 0
63+
disk_size_gb = 30
64+
disk_type = "pd-standard"
65+
accelerator_count = 1
66+
accelerator_type = "nvidia-tesla-a100"
67+
gpu_partition_size = "1g.5gb"
68+
image_type = "COS"
69+
auto_repair = false
70+
service_account = var.compute_engine_service_account
7071
},
7172
{
7273
name = "pool-03"

modules/beta-private-cluster-update-variant/README.md

+1
Original file line numberDiff line numberDiff line change
@@ -308,6 +308,7 @@ The node_pools variable takes the following parameters:
308308
| effect | Effect for the taint | | Required |
309309
| enable_integrity_monitoring | Enables monitoring and attestation of the boot integrity of the instance. The attestation is performed against the integrity policy baseline. This baseline is initially derived from the implicitly trusted boot image when the instance is created. | true | Optional |
310310
| enable_secure_boot | Secure Boot helps ensure that the system only runs authentic software by verifying the digital signature of all boot components, and halting the boot process if signature verification fails. | false | Optional |
311+
| gpu_partition_size | Size of partitions to create on the GPU | null | Optional |
311312
| image_type | The image type to use for this node. Note that changing the image type will delete and recreate all nodes in the node pool | COS | Optional |
312313
| initial_node_count | The initial number of nodes for the pool. In regional or multi-zonal clusters, this is the number of nodes per zone. Changing this will force recreation of the resource. Defaults to the value of min_count | " " | Optional |
313314
| key | The key required for the taint | | Required |

modules/beta-private-cluster-update-variant/cluster.tf

+6-4
Original file line numberDiff line numberDiff line change
@@ -535,11 +535,13 @@ resource "google_container_node_pool" "pools" {
535535

536536
guest_accelerator = [
537537
for guest_accelerator in lookup(each.value, "accelerator_count", 0) > 0 ? [{
538-
type = lookup(each.value, "accelerator_type", "")
539-
count = lookup(each.value, "accelerator_count", 0)
538+
type = lookup(each.value, "accelerator_type", "")
539+
count = lookup(each.value, "accelerator_count", 0)
540+
gpu_partition_size = lookup(each.value, "gpu_partition_size", null)
540541
}] : [] : {
541-
type = guest_accelerator["type"]
542-
count = guest_accelerator["count"]
542+
type = guest_accelerator["type"]
543+
count = guest_accelerator["count"]
544+
gpu_partition_size = guest_accelerator["gpu_partition_size"]
543545
}
544546
]
545547

modules/beta-private-cluster/README.md

+1
Original file line numberDiff line numberDiff line change
@@ -286,6 +286,7 @@ The node_pools variable takes the following parameters:
286286
| effect | Effect for the taint | | Required |
287287
| enable_integrity_monitoring | Enables monitoring and attestation of the boot integrity of the instance. The attestation is performed against the integrity policy baseline. This baseline is initially derived from the implicitly trusted boot image when the instance is created. | true | Optional |
288288
| enable_secure_boot | Secure Boot helps ensure that the system only runs authentic software by verifying the digital signature of all boot components, and halting the boot process if signature verification fails. | false | Optional |
289+
| gpu_partition_size | Size of partitions to create on the GPU | null | Optional |
289290
| image_type | The image type to use for this node. Note that changing the image type will delete and recreate all nodes in the node pool | COS | Optional |
290291
| initial_node_count | The initial number of nodes for the pool. In regional or multi-zonal clusters, this is the number of nodes per zone. Changing this will force recreation of the resource. Defaults to the value of min_count | " " | Optional |
291292
| key | The key required for the taint | | Required |

modules/beta-private-cluster/cluster.tf

+6-4
Original file line numberDiff line numberDiff line change
@@ -450,11 +450,13 @@ resource "google_container_node_pool" "pools" {
450450

451451
guest_accelerator = [
452452
for guest_accelerator in lookup(each.value, "accelerator_count", 0) > 0 ? [{
453-
type = lookup(each.value, "accelerator_type", "")
454-
count = lookup(each.value, "accelerator_count", 0)
453+
type = lookup(each.value, "accelerator_type", "")
454+
count = lookup(each.value, "accelerator_count", 0)
455+
gpu_partition_size = lookup(each.value, "gpu_partition_size", null)
455456
}] : [] : {
456-
type = guest_accelerator["type"]
457-
count = guest_accelerator["count"]
457+
type = guest_accelerator["type"]
458+
count = guest_accelerator["count"]
459+
gpu_partition_size = guest_accelerator["gpu_partition_size"]
458460
}
459461
]
460462

modules/beta-public-cluster-update-variant/README.md

+1
Original file line numberDiff line numberDiff line change
@@ -295,6 +295,7 @@ The node_pools variable takes the following parameters:
295295
| effect | Effect for the taint | | Required |
296296
| enable_integrity_monitoring | Enables monitoring and attestation of the boot integrity of the instance. The attestation is performed against the integrity policy baseline. This baseline is initially derived from the implicitly trusted boot image when the instance is created. | true | Optional |
297297
| enable_secure_boot | Secure Boot helps ensure that the system only runs authentic software by verifying the digital signature of all boot components, and halting the boot process if signature verification fails. | false | Optional |
298+
| gpu_partition_size | Size of partitions to create on the GPU | null | Optional |
298299
| image_type | The image type to use for this node. Note that changing the image type will delete and recreate all nodes in the node pool | COS | Optional |
299300
| initial_node_count | The initial number of nodes for the pool. In regional or multi-zonal clusters, this is the number of nodes per zone. Changing this will force recreation of the resource. Defaults to the value of min_count | " " | Optional |
300301
| key | The key required for the taint | | Required |

modules/beta-public-cluster-update-variant/cluster.tf

+6-4
Original file line numberDiff line numberDiff line change
@@ -516,11 +516,13 @@ resource "google_container_node_pool" "pools" {
516516

517517
guest_accelerator = [
518518
for guest_accelerator in lookup(each.value, "accelerator_count", 0) > 0 ? [{
519-
type = lookup(each.value, "accelerator_type", "")
520-
count = lookup(each.value, "accelerator_count", 0)
519+
type = lookup(each.value, "accelerator_type", "")
520+
count = lookup(each.value, "accelerator_count", 0)
521+
gpu_partition_size = lookup(each.value, "gpu_partition_size", null)
521522
}] : [] : {
522-
type = guest_accelerator["type"]
523-
count = guest_accelerator["count"]
523+
type = guest_accelerator["type"]
524+
count = guest_accelerator["count"]
525+
gpu_partition_size = guest_accelerator["gpu_partition_size"]
524526
}
525527
]
526528

modules/beta-public-cluster/README.md

+1
Original file line numberDiff line numberDiff line change
@@ -273,6 +273,7 @@ The node_pools variable takes the following parameters:
273273
| effect | Effect for the taint | | Required |
274274
| enable_integrity_monitoring | Enables monitoring and attestation of the boot integrity of the instance. The attestation is performed against the integrity policy baseline. This baseline is initially derived from the implicitly trusted boot image when the instance is created. | true | Optional |
275275
| enable_secure_boot | Secure Boot helps ensure that the system only runs authentic software by verifying the digital signature of all boot components, and halting the boot process if signature verification fails. | false | Optional |
276+
| gpu_partition_size | Size of partitions to create on the GPU | null | Optional |
276277
| image_type | The image type to use for this node. Note that changing the image type will delete and recreate all nodes in the node pool | COS | Optional |
277278
| initial_node_count | The initial number of nodes for the pool. In regional or multi-zonal clusters, this is the number of nodes per zone. Changing this will force recreation of the resource. Defaults to the value of min_count | " " | Optional |
278279
| key | The key required for the taint | | Required |

modules/beta-public-cluster/cluster.tf

+6-4
Original file line numberDiff line numberDiff line change
@@ -431,11 +431,13 @@ resource "google_container_node_pool" "pools" {
431431

432432
guest_accelerator = [
433433
for guest_accelerator in lookup(each.value, "accelerator_count", 0) > 0 ? [{
434-
type = lookup(each.value, "accelerator_type", "")
435-
count = lookup(each.value, "accelerator_count", 0)
434+
type = lookup(each.value, "accelerator_type", "")
435+
count = lookup(each.value, "accelerator_count", 0)
436+
gpu_partition_size = lookup(each.value, "gpu_partition_size", null)
436437
}] : [] : {
437-
type = guest_accelerator["type"]
438-
count = guest_accelerator["count"]
438+
type = guest_accelerator["type"]
439+
count = guest_accelerator["count"]
440+
gpu_partition_size = guest_accelerator["gpu_partition_size"]
439441
}
440442
]
441443

modules/private-cluster-update-variant/README.md

+1
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,7 @@ The node_pools variable takes the following parameters:
270270
| effect | Effect for the taint | | Required |
271271
| enable_integrity_monitoring | Enables monitoring and attestation of the boot integrity of the instance. The attestation is performed against the integrity policy baseline. This baseline is initially derived from the implicitly trusted boot image when the instance is created. | true | Optional |
272272
| enable_secure_boot | Secure Boot helps ensure that the system only runs authentic software by verifying the digital signature of all boot components, and halting the boot process if signature verification fails. | false | Optional |
273+
| gpu_partition_size | Size of partitions to create on the GPU | null | Optional |
273274
| image_type | The image type to use for this node. Note that changing the image type will delete and recreate all nodes in the node pool | COS | Optional |
274275
| initial_node_count | The initial number of nodes for the pool. In regional or multi-zonal clusters, this is the number of nodes per zone. Changing this will force recreation of the resource. Defaults to the value of min_count | " " | Optional |
275276
| key | The key required for the taint | | Required |

modules/private-cluster-update-variant/cluster.tf

+6-4
Original file line numberDiff line numberDiff line change
@@ -406,11 +406,13 @@ resource "google_container_node_pool" "pools" {
406406

407407
guest_accelerator = [
408408
for guest_accelerator in lookup(each.value, "accelerator_count", 0) > 0 ? [{
409-
type = lookup(each.value, "accelerator_type", "")
410-
count = lookup(each.value, "accelerator_count", 0)
409+
type = lookup(each.value, "accelerator_type", "")
410+
count = lookup(each.value, "accelerator_count", 0)
411+
gpu_partition_size = lookup(each.value, "gpu_partition_size", null)
411412
}] : [] : {
412-
type = guest_accelerator["type"]
413-
count = guest_accelerator["count"]
413+
type = guest_accelerator["type"]
414+
count = guest_accelerator["count"]
415+
gpu_partition_size = guest_accelerator["gpu_partition_size"]
414416
}
415417
]
416418

modules/private-cluster/README.md

+1
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,7 @@ The node_pools variable takes the following parameters:
248248
| effect | Effect for the taint | | Required |
249249
| enable_integrity_monitoring | Enables monitoring and attestation of the boot integrity of the instance. The attestation is performed against the integrity policy baseline. This baseline is initially derived from the implicitly trusted boot image when the instance is created. | true | Optional |
250250
| enable_secure_boot | Secure Boot helps ensure that the system only runs authentic software by verifying the digital signature of all boot components, and halting the boot process if signature verification fails. | false | Optional |
251+
| gpu_partition_size | Size of partitions to create on the GPU | null | Optional |
251252
| image_type | The image type to use for this node. Note that changing the image type will delete and recreate all nodes in the node pool | COS | Optional |
252253
| initial_node_count | The initial number of nodes for the pool. In regional or multi-zonal clusters, this is the number of nodes per zone. Changing this will force recreation of the resource. Defaults to the value of min_count | " " | Optional |
253254
| key | The key required for the taint | | Required |

modules/private-cluster/cluster.tf

+6-4
Original file line numberDiff line numberDiff line change
@@ -321,11 +321,13 @@ resource "google_container_node_pool" "pools" {
321321

322322
guest_accelerator = [
323323
for guest_accelerator in lookup(each.value, "accelerator_count", 0) > 0 ? [{
324-
type = lookup(each.value, "accelerator_type", "")
325-
count = lookup(each.value, "accelerator_count", 0)
324+
type = lookup(each.value, "accelerator_type", "")
325+
count = lookup(each.value, "accelerator_count", 0)
326+
gpu_partition_size = lookup(each.value, "gpu_partition_size", null)
326327
}] : [] : {
327-
type = guest_accelerator["type"]
328-
count = guest_accelerator["count"]
328+
type = guest_accelerator["type"]
329+
count = guest_accelerator["count"]
330+
gpu_partition_size = guest_accelerator["gpu_partition_size"]
329331
}
330332
]
331333

test/integration/node_pool/controls/gcloud.rb

+4-3
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
cluster_name = attribute('cluster_name')
1818

1919
expected_accelerators_count = "1"
20-
expected_accelerators_type = "nvidia-tesla-p4"
20+
expected_accelerators_type = "nvidia-tesla-a100"
2121

2222
control "gcloud" do
2323
title "Google Compute Engine GKE configuration"
@@ -207,7 +207,7 @@
207207
including(
208208
"name" => "pool-02",
209209
"config" => including(
210-
"machineType" => "n1-standard-2",
210+
"machineType" => "a2-highgpu-1g",
211211
),
212212
)
213213
)
@@ -252,7 +252,8 @@
252252
"name" => "pool-02",
253253
"config" => including(
254254
"accelerators" => [{"acceleratorCount" => expected_accelerators_count,
255-
"acceleratorType" => expected_accelerators_type}],
255+
"acceleratorType" => expected_accelerators_type,
256+
"gpuPartitionSize" => "1g.5gb"}],
256257
),
257258
)
258259
)

0 commit comments

Comments
 (0)