Skip to content

Commit bf9b100

Browse files
chayankumar999amanMahendroo
authored andcommitted
Add support for Regional Managed Instance Groups Resize Request (Dynamic Workload Scheduler) (GoogleCloudPlatform#11968)
1 parent bf993f5 commit bf9b100

File tree

3 files changed

+512
-0
lines changed

3 files changed

+512
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,370 @@
1+
# Copyright 2024 Google Inc.
2+
# Licensed under the Apache License, Version 2.0 (the "License");
3+
# you may not use this file except in compliance with the License.
4+
# You may obtain a copy of the License at
5+
#
6+
# http://www.apache.org/licenses/LICENSE-2.0
7+
#
8+
# Unless required by applicable law or agreed to in writing, software
9+
# distributed under the License is distributed on an "AS IS" BASIS,
10+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
# See the License for the specific language governing permissions and
12+
# limitations under the License.
13+
14+
---
15+
name: 'RegionResizeRequest'
16+
api_resource_type_kind: InstanceGroupManagerResizeRequest
17+
min_version: beta
18+
description: |
19+
Represents a Regional Managed Instance Group Resize Request
20+
21+
Resize Requests are the Managed Instance Group implementation of Dynamic Workload Scheduler Flex Start.
22+
23+
With Dynamic Workload Scheduler in Flex Start mode, you submit a GPU capacity request for your AI/ML jobs by indicating how many GPUs you need, a duration, and your preferred region. Dynamic Workload Scheduler intelligently persists the request; once the capacity becomes available, it automatically provisions your VMs, enabling your workloads to run continuously for the entire duration of the capacity allocation.
24+
references:
25+
guides:
26+
'About resize requests in a MIG': 'https://cloud.google.com/compute/docs/instance-groups/about-resize-requests-mig'
27+
# Link to the REST API reference for the resource.
28+
api: 'https://cloud.google.com/compute/docs/reference/rest/beta/regionInstanceGroupManagerResizeRequests'
29+
docs:
30+
### List Method ###
31+
base_url: 'projects/{{project}}/regions/{{region}}/instanceGroupManagers/{{instance_group_manager}}/resizeRequests'
32+
### Get Method
33+
self_link: 'projects/{{project}}/regions/{{region}}/instanceGroupManagers/{{instance_group_manager}}/resizeRequests/{{name}}'
34+
immutable: true
35+
timeouts:
36+
insert_minutes: 20
37+
update_minutes: 20
38+
delete_minutes: 20
39+
# Sets parameters for handling operations returned by the API.
40+
async:
41+
actions: ['create', 'delete', 'update']
42+
type: 'OpAsync'
43+
operation:
44+
base_url: '{{op_id}}'
45+
46+
custom_code:
47+
custom_delete: 'templates/terraform/custom_delete/compute_rmig_resize_request_delete.go.tmpl'
48+
# Examples for testing
49+
examples:
50+
- name: 'compute_rmig_resize_request'
51+
min_version: beta
52+
primary_resource_id: 'a3_resize_request'
53+
vars:
54+
resize_request_name: 'a3-dws'
55+
# Resize request parameters injected via URL
56+
parameters:
57+
- name: 'region'
58+
type: ResourceRef
59+
description: |
60+
The reference of the compute region scoping this request.
61+
url_param_only: true
62+
required: true
63+
resource: 'Region'
64+
imports: 'name'
65+
- name: 'instanceGroupManager'
66+
type: ResourceRef
67+
description: |
68+
The reference of the regional instance group manager this ResizeRequest is a part of.
69+
url_param_only: true
70+
required: true
71+
resource: 'InstanceGroupManager'
72+
imports: 'name'
73+
# Non-URL parameters including input and output parameters
74+
properties:
75+
- name: 'creationTimestamp'
76+
type: Time
77+
description: |
78+
The creation timestamp for this resize request in RFC3339 text format.
79+
output: true
80+
- name: 'state'
81+
type: String
82+
description: |
83+
Current state of the request.
84+
output: true
85+
- name: 'name'
86+
type: String
87+
description: |
88+
The name of this resize request. The name must be 1-63 characters long, and comply with RFC1035.
89+
required: true
90+
- name: 'description'
91+
type: String
92+
description: |
93+
An optional description of this resize-request.
94+
- name: 'resizeBy'
95+
type: Integer
96+
description: |
97+
The number of instances to be created by this resize request. The group's target size will be increased by this number.
98+
required: true
99+
- name: 'requestedRunDuration'
100+
type: NestedObject
101+
description: |
102+
Requested run duration for instances that will be created by this request. At the end of the run duration instances will be deleted.
103+
properties:
104+
- name: 'seconds'
105+
type: String
106+
description: |
107+
Span of time at a resolution of a second. Must be from 600 to 604800 inclusive. Note: the minimum and maximum allowed values for requestedRunDuration are 10 minutes (600 seconds) and 7 days (604800 seconds), respectively.
108+
required: true
109+
- name: 'nanos'
110+
type: Integer
111+
description: |
112+
Span of time that's a fraction of a second at nanosecond resolution. Durations less than one second are represented with a 0 seconds field and a positive nanos field. Must be from 0 to 999,999,999 inclusive.
113+
- name: 'status'
114+
type: NestedObject
115+
description: |
116+
Status of the request.
117+
output: true
118+
properties:
119+
# Status.error
120+
- name: 'error'
121+
type: NestedObject
122+
description: |
123+
Fatal errors encountered during the queueing or provisioning phases of the ResizeRequest that caused the transition to the FAILED state. Contrary to the lastAttempt errors, this field is final and errors are never removed from here, as the ResizeRequest is not going to retry.
124+
output: true
125+
properties:
126+
- name: 'errors'
127+
type: Array
128+
description: |
129+
The array of errors encountered while processing this operation.
130+
output: true
131+
item_type:
132+
type: NestedObject
133+
properties:
134+
- name: 'code'
135+
type: String
136+
description: |
137+
The error type identifier for this error.
138+
output: true
139+
- name: 'location'
140+
type: String
141+
description: |
142+
Indicates the field in the request that caused the error. This property is optional.
143+
output: true
144+
- name: 'message'
145+
type: String
146+
description: |
147+
An optional, human-readable error message.
148+
output: true
149+
- name: 'errorDetails'
150+
type: Array
151+
description: |
152+
An array of messages that contain the error details. There is a set of defined message types to use for providing details. The syntax depends on the error code. For example, QuotaExceededInfo will have details when the error code is QUOTA_EXCEEDED.
153+
output: true
154+
item_type:
155+
type: NestedObject
156+
properties:
157+
- name: 'errorInfo'
158+
type: NestedObject
159+
output: true
160+
properties:
161+
- name: 'reason'
162+
type: String
163+
description: |
164+
The reason of the error. This is a constant value that identifies the proximate cause of the error. Error reasons are unique within a particular domain of errors.
165+
output: true
166+
- name: 'domain'
167+
type: String
168+
description: |
169+
The logical grouping to which the "reason" belongs. The error domain is typically the registered service name of the tool or product that generates the error. Example: "pubsub.googleapis.com".
170+
output: true
171+
- name: 'metadatas'
172+
type: KeyValuePairs
173+
description: |
174+
Additional structured details about this error.
175+
output: true
176+
- name: 'quotaInfo'
177+
type: NestedObject
178+
output: true
179+
properties:
180+
- name: 'metricName'
181+
type: String
182+
description: |
183+
The Compute Engine quota metric name.
184+
output: true
185+
- name: 'limitName'
186+
type: String
187+
description: |
188+
The name of the quota limit.
189+
output: true
190+
- name: 'dimensions'
191+
type: KeyValuePairs
192+
description: |
193+
The map holding related quota dimensions.
194+
output: true
195+
- name: 'limit'
196+
type: Integer
197+
description: |
198+
Current effective quota limit. The limit's unit depends on the quota type or metric.
199+
output: true
200+
- name: 'futureLimit'
201+
type: Integer
202+
description: |
203+
Future quota limit being rolled out. The limit's unit depends on the quota type or metric.
204+
output: true
205+
- name: 'rolloutStatus'
206+
type: String
207+
description: |
208+
Rollout status of the future quota limit.
209+
output: true
210+
- name: 'help'
211+
type: NestedObject
212+
output: true
213+
properties:
214+
- name: 'links'
215+
type: NestedObject
216+
output: true
217+
properties:
218+
- name: 'description'
219+
type: String
220+
description: |
221+
Describes what the link offers.
222+
output: true
223+
- name: 'url'
224+
type: String
225+
description: |
226+
The URL of the link.
227+
output: true
228+
- name: 'localizedMessage'
229+
type: NestedObject
230+
output: true
231+
properties:
232+
- name: 'locale'
233+
type: String
234+
description: |
235+
The locale used following the specification defined at https://www.rfc-editor.org/rfc/bcp/bcp47.txt. Examples are: "en-US", "fr-CH", "es-MX"
236+
output: true
237+
- name: 'message'
238+
type: String
239+
description: |
240+
The localized error message in the above locale.
241+
output: true
242+
# Status.lastAttempt
243+
- name: 'lastAttempt'
244+
type: NestedObject
245+
description: |
246+
Information about the last attempt to fulfill the request. The value is temporary since the ResizeRequest can retry, as long as it's still active and the last attempt value can either be cleared or replaced with a different error. Since ResizeRequest retries infrequently, the value may be stale and no longer show an active problem. The value is cleared when ResizeRequest transitions to the final state (becomes inactive). If the final state is FAILED the error describing it will be stored in the "error" field only.
247+
output: true
248+
properties:
249+
- name: 'error'
250+
type: NestedObject
251+
description: |
252+
Errors that prevented the ResizeRequest from being fulfilled. Unlike the final "error" field of the status, this value is temporary: it can be cleared or replaced while the ResizeRequest is still active and retrying.
253+
output: true
254+
properties:
255+
- name: 'errors'
256+
type: Array
257+
description: |
258+
The array of errors encountered while processing this operation.
259+
output: true
260+
item_type:
261+
type: NestedObject
262+
properties:
263+
- name: 'code'
264+
type: String
265+
description: |
266+
The error type identifier for this error.
267+
output: true
268+
- name: 'location'
269+
type: String
270+
description: |
271+
Indicates the field in the request that caused the error. This property is optional.
272+
output: true
273+
- name: 'message'
274+
type: String
275+
description: |
276+
An optional, human-readable error message.
277+
output: true
278+
- name: 'errorDetails'
279+
type: Array
280+
description: |
281+
An array of messages that contain the error details. There is a set of defined message types to use for providing details. The syntax depends on the error code. For example, QuotaExceededInfo will have details when the error code is QUOTA_EXCEEDED.
282+
output: true
283+
item_type:
284+
type: NestedObject
285+
properties:
286+
- name: 'errorInfo'
287+
type: NestedObject
288+
output: true
289+
properties:
290+
- name: 'reason'
291+
type: String
292+
description: |
293+
The reason of the error. This is a constant value that identifies the proximate cause of the error. Error reasons are unique within a particular domain of errors.
294+
output: true
295+
- name: 'domain'
296+
type: String
297+
description: |
298+
The logical grouping to which the "reason" belongs. The error domain is typically the registered service name of the tool or product that generates the error. Example: "pubsub.googleapis.com".
299+
output: true
300+
- name: 'metadatas'
301+
type: KeyValuePairs
302+
description: |
303+
Additional structured details about this error.
304+
output: true
305+
- name: 'quotaInfo'
306+
type: NestedObject
307+
output: true
308+
properties:
309+
- name: 'metricName'
310+
type: String
311+
description: |
312+
The Compute Engine quota metric name.
313+
output: true
314+
- name: 'limitName'
315+
type: String
316+
description: |
317+
The name of the quota limit.
318+
output: true
319+
- name: 'dimensions'
320+
type: KeyValuePairs
321+
description: |
322+
The map holding related quota dimensions.
323+
output: true
324+
- name: 'limit'
325+
type: Integer
326+
description: |
327+
Current effective quota limit. The limit's unit depends on the quota type or metric.
328+
output: true
329+
- name: 'futureLimit'
330+
type: Integer
331+
description: |
332+
Future quota limit being rolled out. The limit's unit depends on the quota type or metric.
333+
output: true
334+
- name: 'rolloutStatus'
335+
type: String
336+
description: |
337+
Rollout status of the future quota limit.
338+
output: true
339+
- name: 'help'
340+
type: NestedObject
341+
output: true
342+
properties:
343+
- name: 'links'
344+
type: NestedObject
345+
output: true
346+
properties:
347+
- name: 'description'
348+
type: String
349+
description: |
350+
Describes what the link offers.
351+
output: true
352+
- name: 'url'
353+
type: String
354+
description: |
355+
The URL of the link.
356+
output: true
357+
- name: 'localizedMessage'
358+
type: NestedObject
359+
output: true
360+
properties:
361+
- name: 'locale'
362+
type: String
363+
description: |
364+
The locale used following the specification defined at https://www.rfc-editor.org/rfc/bcp/bcp47.txt. Examples are: "en-US", "fr-CH", "es-MX"
365+
output: true
366+
- name: 'message'
367+
type: String
368+
description: |
369+
The localized error message in the above locale.
370+
output: true

0 commit comments

Comments
 (0)