Commit c4013cf

JerryLeiDing (Jerry Ding) authored and committed
adds support for the Dataproc on GDC SparkApplication resource (GoogleCloudPlatform#12237)
Co-authored-by: Jerry Ding <[email protected]>
1 parent 3911e25 commit c4013cf

7 files changed: +439 -0 lines changed
@@ -0,0 +1,332 @@
# Copyright 2024 Google Inc.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

---
name: SparkApplication
description: A Spark application is a single Spark workload run on a GDC cluster.
references:
  guides:
    'Dataproc Intro': 'https://cloud.google.com/dataproc/'
  api: 'https://cloud.google.com/dataproc-gdc/docs/reference/rest/v1/projects.locations.serviceInstances.sparkApplications'
parameters:
  - name: location
    type: String
    description: 'The location of the spark application.'
    url_param_only: true
    required: true
    immutable: true
  - name: serviceinstance
    type: String
    description: 'The id of the service instance to which this spark application belongs.'
    url_param_only: true
    required: true
    immutable: true
  - name: sparkApplicationId
    type: String
    description: 'The id of the application.'
    url_param_only: true
    required: true
    immutable: true
async:
  actions: ['create', 'delete']
  type: OpAsync
  operation:
    base_url: "{{op_id}}"
immutable: true
examples:
  - name: "dataprocgdc_sparkapplication_basic"
    primary_resource_id: "spark-application"
    vars:
      spark_application_id: "tf-e2e-spark-app-basic"
      project: "my-project"
    test_vars_overrides:
      'project': '"gdce-cluster-monitoring"'
  - name: "dataprocgdc_sparkapplication"
    primary_resource_id: "spark-application"
    vars:
      spark_application_id: "tf-e2e-spark-app"
      application_environment_id: "tf-e2e-spark-app-env"
      project: "my-project"
    test_vars_overrides:
      'project': '"gdce-cluster-monitoring"'
  - name: "dataprocgdc_sparkapplication_pyspark"
    primary_resource_id: "spark-application"
    vars:
      spark_application_id: "tf-e2e-pyspark-app"
      project: "my-project"
    test_vars_overrides:
      'project': '"gdce-cluster-monitoring"'
  - name: "dataprocgdc_sparkapplication_sparkr"
    primary_resource_id: "spark-application"
    vars:
      spark_application_id: "tf-e2e-sparkr-app"
      project: "my-project"
    test_vars_overrides:
      'project': '"gdce-cluster-monitoring"'
  - name: "dataprocgdc_sparkapplication_sparksql"
    primary_resource_id: "spark-application"
    vars:
      spark_application_id: "tf-e2e-sparksql-app"
      project: "my-project"
    test_vars_overrides:
      'project': '"gdce-cluster-monitoring"'
  - name: "dataprocgdc_sparkapplication_sparksql_query_file"
    primary_resource_id: "spark-application"
    vars:
      spark_application_id: "tf-e2e-sparksql-app"
      project: "my-project"
    test_vars_overrides:
      'project': '"gdce-cluster-monitoring"'
base_url: projects/{{project}}/locations/{{location}}/serviceInstances/{{serviceinstance}}/sparkApplications
create_url: projects/{{project}}/locations/{{location}}/serviceInstances/{{serviceinstance}}/sparkApplications?sparkApplicationId={{spark_application_id}}
self_link: projects/{{project}}/locations/{{location}}/serviceInstances/{{serviceinstance}}/sparkApplications/{{spark_application_id}}
id_format: projects/{{project}}/locations/{{location}}/serviceInstances/{{serviceinstance}}/sparkApplications/{{spark_application_id}}
import_format:
  - projects/{{project}}/locations/{{location}}/serviceInstances/{{serviceinstance}}/sparkApplications/{{spark_application_id}}
autogen_async: true
properties:
  - name: pysparkApplicationConfig
    type: NestedObject
    exactly_one_of:
      - 'pyspark_application_config'
      - 'spark_application_config'
      - 'spark_sql_application_config'
      - 'spark_r_application_config'
    properties:
      - name: mainPythonFileUri
        type: String
        description: "The HCFS URI of the main Python file to use as the driver. Must be a .py file."
        required: true
      - name: args
        type: Array
        item_type:
          type: String
        description: "The arguments to pass to the driver. Do not include arguments, such as `--conf`, that can be set as job properties, since a collision may occur that causes an incorrect job submission."
      - name: pythonFileUris
        type: Array
        item_type:
          type: String
        description: "HCFS file URIs of Python files to pass to the PySpark framework. Supported file types: .py, .egg, and .zip."
      - name: jarFileUris
        type: Array
        item_type:
          type: String
        description: "HCFS URIs of jar files to add to the CLASSPATHs of the Python driver and tasks."
      - name: fileUris
        type: Array
        item_type:
          type: String
        description: "HCFS URIs of files to be placed in the working directory of each executor. Useful for naively parallel tasks."
      - name: archiveUris
        type: Array
        item_type:
          type: String
        description: "HCFS URIs of archives to be extracted into the working directory of each executor. Supported file types: .jar, .tar, .tar.gz, .tgz, and .zip."
    description: 'Represents the PySparkApplicationConfig.'
  - name: sparkApplicationConfig
    type: NestedObject
    exactly_one_of:
      - 'pyspark_application_config'
      - 'spark_application_config'
      - 'spark_sql_application_config'
      - 'spark_r_application_config'
    properties:
      - name: mainJarFileUri
        type: String
        description: 'The HCFS URI of the jar file that contains the main class.'
      - name: mainClass
        type: String
        description: "The name of the driver main class. The jar file that contains the class must be in the classpath or specified in `jar_file_uris`."
      - name: args
        type: Array
        item_type:
          type: String
        description: "The arguments to pass to the driver. Do not include arguments that can be set as application properties, such as `--conf`, since a collision can occur that causes an incorrect application submission."
      - name: jarFileUris
        type: Array
        item_type:
          type: String
        description: "HCFS URIs of jar files to add to the classpath of the Spark driver and tasks."
      - name: fileUris
        type: Array
        item_type:
          type: String
        description: "HCFS URIs of files to be placed in the working directory of each executor."
      - name: archiveUris
        type: Array
        item_type:
          type: String
        description: "HCFS URIs of archives to be extracted into the working directory of each executor. Supported file types: `.jar`, `.tar`, `.tar.gz`, `.tgz`, and `.zip`."
    description: 'Represents the SparkApplicationConfig.'
  - name: sparkRApplicationConfig
    type: NestedObject
    exactly_one_of:
      - 'pyspark_application_config'
      - 'spark_application_config'
      - 'spark_sql_application_config'
      - 'spark_r_application_config'
    properties:
      - name: mainRFileUri
        type: String
        description: "The HCFS URI of the main R file to use as the driver. Must be a .R file."
        required: true
      - name: args
        type: Array
        item_type:
          type: String
        description: "The arguments to pass to the driver. Do not include arguments, such as `--conf`, that can be set as job properties, since a collision may occur that causes an incorrect job submission."
      - name: fileUris
        type: Array
        item_type:
          type: String
        description: "HCFS URIs of files to be placed in the working directory of each executor. Useful for naively parallel tasks."
      - name: archiveUris
        type: Array
        item_type:
          type: String
        description: "HCFS URIs of archives to be extracted into the working directory of each executor. Supported file types: .jar, .tar, .tar.gz, .tgz, and .zip."
    description: 'Represents the SparkRApplicationConfig.'
  - name: sparkSqlApplicationConfig
    type: NestedObject
    exactly_one_of:
      - 'pyspark_application_config'
      - 'spark_application_config'
      - 'spark_sql_application_config'
      - 'spark_r_application_config'
    properties:
      - name: queryFileUri
        type: String
        description: 'The HCFS URI of the script that contains SQL queries.'
      - name: queryList
        type: NestedObject
        properties:
          - name: queries
            type: Array
            item_type:
              type: String
            description: 'The queries to run.'
            required: true
        description: 'Represents a list of queries.'
      - name: scriptVariables
        type: KeyValuePairs
        description: "Mapping of query variable names to values (equivalent to the Spark SQL command: SET `name=\"value\";`)."
      - name: jarFileUris
        type: Array
        item_type:
          type: String
        description: 'HCFS URIs of jar files to be added to the Spark CLASSPATH.'
    description: 'Represents the SparkSqlApplicationConfig.'
  - name: name
    type: String
    description: "Identifier. The name of the application. Format: projects/{project}/locations/{location}/serviceInstances/{service_instance}/sparkApplications/{application}"
    output: true
  - name: uid
    type: String
    description: "System generated unique identifier for this application, formatted as UUID4."
    output: true
  - name: displayName
    type: String
    description: 'User-provided human-readable name to be used in user interfaces.'
  - name: createTime
    type: String
    description: 'The timestamp when the resource was created.'
    output: true
  - name: updateTime
    type: String
    description: 'The timestamp when the resource was most recently updated.'
    output: true
  - name: state
    type: String
    description: |
      The current state.
      Possible values:
      * `STATE_UNSPECIFIED`
      * `PENDING`
      * `RUNNING`
      * `CANCELLING`
      * `CANCELLED`
      * `SUCCEEDED`
      * `FAILED`
    output: true
  - name: reconciling
    type: Boolean
    description: "Whether the application is currently reconciling. True if the current state of the resource does not match the intended state, and the system is working to reconcile them, whether or not the change was user initiated."
    output: true
  - name: labels
    type: KeyValueLabels
    description: "The labels to associate with this application. Labels may be used for filtering and billing tracking."
  - name: annotations
    type: KeyValueAnnotations
    description: "The annotations to associate with this application. Annotations may be used to store client information, but are not used by the server."
  - name: outputUri
    type: String
    description: "An HCFS URI pointing to the location of stdout and stderr of the application. Mainly useful for Pantheon and gcloud. Not in scope for private GA."
    output: true
  - name: monitoringEndpoint
    type: String
    description: "URL for a monitoring UI for this application (for eventual Spark PHS/UI support). Out of scope for private GA."
    output: true
  - name: properties
    type: KeyValuePairs
    description: 'Application-specific properties.'
  - name: stateMessage
    type: String
    description: 'A message explaining the current state.'
    output: true
  - name: version
    type: String
    description: 'The Dataproc version of this application.'
  - name: applicationEnvironment
    type: String
    description: 'An ApplicationEnvironment from which to inherit configuration properties.'
  - name: namespace
    type: String
    description: "The Kubernetes namespace in which to create the application. This namespace must already exist on the cluster."
  - name: dependencyImages
    type: Array
    item_type:
      type: String
    description: "List of container image URIs for additional file dependencies. Dependent files are sequentially copied from each image. If a file with the same name exists in two images, the file from the later image is used."
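
Note: the four application config blocks above are mutually exclusive (exactly_one_of), and the example files that follow cover only the Spark and PySpark variants. As a rough sketch of the SparkSQL variant with an inline query list, a resource might look like the following; the query text, IDs, and generated block names are illustrative assumptions, not values taken from this commit:

resource "google_dataproc_gdc_spark_application" "sparksql_app" {
  spark_application_id = "tf-e2e-sparksql-app"                   # illustrative ID
  serviceinstance      = "do-not-delete-dataproc-gdc-instance"   # assumed, matching the other examples
  project              = "my-project"
  location             = "us-west2"
  namespace            = "default"
  spark_sql_application_config {
    query_list {
      queries = ["SHOW DATABASES;"]                              # placeholder query
    }
    jar_file_uris = ["file:///usr/lib/spark/examples/jars/spark-examples.jar"]
  }
}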
@@ -0,0 +1,32 @@
resource "google_dataproc_gdc_application_environment" "app_env" {
  application_environment_id = "{{index $.Vars "application_environment_id"}}"
  serviceinstance = "do-not-delete-dataproc-gdc-instance"
  project = "{{index $.Vars "project"}}"
  location = "us-west2"
  namespace = "default"
}

resource "google_dataproc_gdc_spark_application" "{{$.PrimaryResourceId}}" {
  spark_application_id = "{{index $.Vars "spark_application_id"}}"
  serviceinstance = "do-not-delete-dataproc-gdc-instance"
  project = "{{index $.Vars "project"}}"
  location = "us-west2"
  namespace = "default"
  labels = {
    "test-label": "label-value"
  }
  annotations = {
    "an_annotation": "annotation_value"
  }
  properties = {
    "spark.executor.instances": "2"
  }
  application_environment = google_dataproc_gdc_application_environment.app_env.name
  version = "1.2"
  spark_application_config {
    main_jar_file_uri = "file:///usr/lib/spark/examples/jars/spark-examples.jar"
    jar_file_uris = ["file:///usr/lib/spark/examples/jars/spark-examples.jar"]
    archive_uris = ["file://usr/lib/spark/examples/spark-examples.jar"]
    file_uris = ["file:///usr/lib/spark/examples/jars/spark-examples.jar"]
  }
}
@@ -0,0 +1,12 @@
resource "google_dataproc_gdc_spark_application" "{{$.PrimaryResourceId}}" {
  spark_application_id = "{{index $.Vars "spark_application_id"}}"
  serviceinstance = "do-not-delete-dataproc-gdc-instance"
  project = "{{index $.Vars "project"}}"
  location = "us-west2"
  namespace = "default"
  spark_application_config {
    main_class = "org.apache.spark.examples.SparkPi"
    jar_file_uris = ["file:///usr/lib/spark/examples/jars/spark-examples.jar"]
    args = ["10000"]
  }
}
@@ -0,0 +1,17 @@
resource "google_dataproc_gdc_spark_application" "{{$.PrimaryResourceId}}" {
  spark_application_id = "{{index $.Vars "spark_application_id"}}"
  serviceinstance = "do-not-delete-dataproc-gdc-instance"
  project = "{{index $.Vars "project"}}"
  location = "us-west2"
  namespace = "default"
  display_name = "A Pyspark application for a Terraform create test"
  dependency_images = ["gcr.io/some/image"]
  pyspark_application_config {
    main_python_file_uri = "gs://goog-dataproc-initialization-actions-us-west2/conda/test_conda.py"
    jar_file_uris = ["file:///usr/lib/spark/examples/jars/spark-examples.jar"]
    python_file_uris = ["gs://goog-dataproc-initialization-actions-us-west2/conda/get-sys-exec.py"]
    file_uris = ["file://usr/lib/spark/examples/spark-examples.jar"]
    archive_uris = ["file://usr/lib/spark/examples/spark-examples.jar"]
    args = ["10"]
  }
}
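
The SparkR and SparkSQL example files added by this commit are not reproduced in this excerpt. A minimal sketch of the SparkR variant, assuming the same service instance and location as the examples above and an illustrative main R file URI:

resource "google_dataproc_gdc_spark_application" "sparkr_app" {
  spark_application_id = "tf-e2e-sparkr-app"                     # illustrative ID
  serviceinstance      = "do-not-delete-dataproc-gdc-instance"   # assumed, matching the other examples
  project              = "my-project"
  location             = "us-west2"
  namespace            = "default"
  spark_r_application_config {
    main_r_file_uri = "gs://my-bucket/my-script.R"               # placeholder URI, not from the commit
    file_uris       = ["file://usr/lib/spark/examples/spark-examples.jar"]
    archive_uris    = ["file://usr/lib/spark/examples/spark-examples.jar"]
    args            = ["10"]
  }
}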
