
Commit 7caa09c (1 parent: ab3c4ef)

[IcebergIO] Add dataflow tests with and without managed experiment (#34538)

* add dataflow tests with and without managed experiment
* wrap in doLast

6 files changed: +227 -12 lines changed
.github/trigger_files/IO_Iceberg_Integration_Tests_Dataflow.json

+4 -0

@@ -0,0 +1,4 @@
+{
+  "comment": "Modify this file in a trivial way to cause this test suite to run.",
+  "modification": 1
+}
.github/trigger_files/IO_Iceberg_Managed_Integration_Tests_Dataflow.json

+4 -0

@@ -0,0 +1,4 @@
+{
+  "comment": "Modify this file in a trivial way to cause this test suite to run.",
+  "modification": 1
+}
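Both trigger files follow the repository's existing convention: each workflow below lists its trigger file under pull_request_target.paths, so any edit to the file causes the corresponding suite to run on a PR. A minimal sketch of such a trivial edit (the counter value itself carries no meaning):

{
  "comment": "Modify this file in a trivial way to cause this test suite to run.",
  "modification": 2
}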
.github/workflows/IO_Iceberg_Integration_Tests_Dataflow.yml

+78 -0

@@ -0,0 +1,78 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: IcebergIO Integration Tests on Dataflow
+
+on:
+  schedule:
+    - cron: '30 4/6 * * *'
+  pull_request_target:
+    paths: [ 'release/trigger_all_tests.json', '.github/trigger_files/IO_Iceberg_Integration_Tests_Dataflow.json' ]
+  workflow_dispatch:
+
+# Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
+permissions:
+  actions: write
+  pull-requests: write
+  checks: write
+  contents: read
+  deployments: read
+  id-token: none
+  issues: write
+  discussions: read
+  packages: read
+  pages: read
+  repository-projects: read
+  security-events: read
+  statuses: read
+
+# This allows a subsequently queued workflow run to interrupt previous runs
+concurrency:
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
+  cancel-in-progress: true
+
+env:
+  DEVELOCITY_ACCESS_KEY: ${{ secrets.DEVELOCITY_ACCESS_KEY }}
+  GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }}
+  GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }}
+
+jobs:
+  IO_Iceberg_Integration_Tests_Dataflow:
+    if: |
+      github.event_name == 'pull_request_target' ||
+      github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
+      github.event.comment.body == 'Run IcebergIO Integration Tests on Dataflow'
+    runs-on: [self-hosted, ubuntu-20.04, main]
+    timeout-minutes: 120
+    name: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
+    strategy:
+      matrix:
+        job_name: ["IO_Iceberg_Integration_Tests_Dataflow"]
+        job_phrase: ["Run IcebergIO Integration Tests on Dataflow"]
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup repository
+        uses: ./.github/actions/setup-action
+        with:
+          comment_phrase: ${{ matrix.job_phrase }}
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+          github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
+      - name: Setup environment
+        uses: ./.github/actions/setup-environment-action
+      - name: Run IcebergIO Integration Tests on Dataflow
+        uses: ./.github/actions/gradle-command-self-hosted-action
+        with:
+          gradle-command: :sdks:java:io:iceberg:dataflowIntegrationTest --info
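A note on the triggers: the cron expression '30 4/6 * * *' fires at minute 30 every 6 hours starting at hour 4, i.e. 04:30, 10:30, 16:30, and 22:30 UTC, and the if guard restricts scheduled runs to the apache/beam repository. The job_phrase matrix value follows the repository's comment-phrase convention, so commenting the exact phrase "Run IcebergIO Integration Tests on Dataflow" on a PR (matched against github.event.comment.body) re-runs the suite.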
.github/workflows/IO_Iceberg_Managed_Integration_Tests_Dataflow.yml

+81 -0

@@ -0,0 +1,81 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: IcebergIO Managed Integration Tests on Dataflow
+
+on:
+  schedule:
+    - cron: '30 4/6 * * *'
+  pull_request_target:
+    paths: [ 'release/trigger_all_tests.json', '.github/trigger_files/IO_Iceberg_Managed_Integration_Tests_Dataflow.json' ]
+  workflow_dispatch:
+
+# Setting explicit permissions for the action to avoid the default permissions which are `write-all` in case of pull_request_target event
+permissions:
+  actions: write
+  pull-requests: write
+  checks: write
+  contents: read
+  deployments: read
+  id-token: none
+  issues: write
+  discussions: read
+  packages: read
+  pages: read
+  repository-projects: read
+  security-events: read
+  statuses: read
+
+# This allows a subsequently queued workflow run to interrupt previous runs
+concurrency:
+  group: '${{ github.workflow }} @ ${{ github.event.issue.number || github.sha || github.head_ref || github.ref }}-${{ github.event.schedule || github.event.comment.id || github.event.sender.login }}'
+  cancel-in-progress: true
+
+env:
+  DEVELOCITY_ACCESS_KEY: ${{ secrets.DEVELOCITY_ACCESS_KEY }}
+  GRADLE_ENTERPRISE_CACHE_USERNAME: ${{ secrets.GE_CACHE_USERNAME }}
+  GRADLE_ENTERPRISE_CACHE_PASSWORD: ${{ secrets.GE_CACHE_PASSWORD }}
+
+jobs:
+  IO_Iceberg_Managed_Integration_Tests_Dataflow:
+    if: |
+      github.event_name == 'pull_request_target' ||
+      github.event_name == 'workflow_dispatch' ||
+      (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
+      github.event.comment.body == 'Run IcebergIO Managed Integration Tests on Dataflow'
+    runs-on: [self-hosted, ubuntu-20.04, main]
+    timeout-minutes: 120
+    name: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
+    strategy:
+      matrix:
+        job_name: ["IO_Iceberg_Managed_Integration_Tests_Dataflow"]
+        job_phrase: ["Run IcebergIO Managed Integration Tests on Dataflow"]
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup repository
+        uses: ./.github/actions/setup-action
+        with:
+          comment_phrase: ${{ matrix.job_phrase }}
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+          github_job: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
+      - name: Setup environment
+        uses: ./.github/actions/setup-environment-action
+      - name: Run IcebergIO Managed Integration Tests on Dataflow
+        uses: ./.github/actions/gradle-command-self-hosted-action
+        with:
+          gradle-command: :sdks:java:io:iceberg:dataflowIntegrationTest
+          arguments: |
+            --info \
+            -PenableManagedTransforms
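This workflow mirrors the previous one apart from the names, the trigger-file path, and the Gradle invocation: it additionally passes -PenableManagedTransforms, which the new dataflowIntegrationTest task (below) turns into the enable_managed_transforms Dataflow experiment.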

sdks/java/io/iceberg/build.gradle

+43 -6
@@ -128,10 +128,12 @@ hadoopVersions.each { kv ->
   }
 }
 
+def gcpProject = project.findProperty('gcpProject') ?: 'apache-beam-testing'
+def gcpTempLocation = project.findProperty('gcpTempLocation') ?: 'gs://managed-iceberg-integration-tests/temp'
+def usingJava8 = (project.findProperty('testJavaVersion') == '8' || JavaVersion.current().equals(JavaVersion.VERSION_1_8))
+
 task integrationTest(type: Test) {
   group = "Verification"
-  def gcpProject = project.findProperty('gcpProject') ?: 'apache-beam-testing'
-  def gcpTempLocation = project.findProperty('gcpTempLocation') ?: 'gs://managed-iceberg-integration-tests'
   systemProperty "beamTestPipelineOptions", JsonOutput.toJson([
     "--project=${gcpProject}",
     "--tempLocation=${gcpTempLocation}",
@@ -142,8 +144,7 @@ task integrationTest(type: Test) {
 
   include '**/*IT.class'
   // BQ metastore catalog doesn't support java 8
-  if (project.findProperty('testJavaVersion') == '8' ||
-      JavaVersion.current().equals(JavaVersion.VERSION_1_8)) {
+  if (usingJava8) {
     exclude '**/BigQueryMetastoreCatalogIT.class'
   }
 
@@ -152,9 +153,45 @@ task integrationTest(type: Test) {
   testClassesDirs = sourceSets.test.output.classesDirs
 }
 
+task dataflowIntegrationTest(type: Test) {
+  group = "Verification"
+  def args = [
+    "--runner=DataflowRunner",
+    "--region=us-central1",
+    "--project=${gcpProject}",
+    "--tempLocation=${gcpTempLocation}",
+    "--tempRoot=${gcpTempLocation}",
+    "--experiments=use_runner_v2"
+  ]
+  if (project.hasProperty('enableManagedTransforms')) {
+    args.add("--experiments=enable_managed_transforms")
+  }
+  systemProperty "beamTestPipelineOptions", JsonOutput.toJson(args)
+
+  // Disable Gradle cache: these ITs interact with live service that should always be considered "out of date"
+  outputs.upToDateWhen { false }
+
+  filter {
+    includeTestsMatching 'org.apache.beam.sdk.io.iceberg.catalog.BigQueryMetastoreCatalogIT.testRead'
+    includeTestsMatching 'org.apache.beam.sdk.io.iceberg.catalog.BigQueryMetastoreCatalogIT.testStreamingRead'
+    includeTestsMatching 'org.apache.beam.sdk.io.iceberg.catalog.BigQueryMetastoreCatalogIT.testWrite'
+    includeTestsMatching 'org.apache.beam.sdk.io.iceberg.catalog.BigQueryMetastoreCatalogIT.testWriteRead'
+    includeTestsMatching 'org.apache.beam.sdk.io.iceberg.catalog.BigQueryMetastoreCatalogIT.testReadWriteStreaming'
+    includeTestsMatching 'org.apache.beam.sdk.io.iceberg.catalog.BigQueryMetastoreCatalogIT.testStreamToPartitionedDynamicDestinations'
+  }
+
+  doLast {
+    if (usingJava8) {
+      throw new StopExecutionException("BigQueryMetastoreCatalog doesn't support Java 8");
+    }
+  }
+
+  maxParallelForks 4
+  classpath = sourceSets.test.runtimeClasspath
+  testClassesDirs = sourceSets.test.output.classesDirs
+}
+
 task loadTest(type: Test) {
-  def gcpProject = project.findProperty('gcpProject') ?: 'apache-beam-testing'
-  def gcpTempLocation = project.findProperty('gcpTempLocation') ?: 'gs://temp-storage-for-end-to-end-tests/temp-lt'
   systemProperty "beamTestPipelineOptions", JsonOutput.toJson([
     "--project=${gcpProject}",
     "--tempLocation=${gcpTempLocation}",

sdks/java/io/iceberg/src/test/java/org/apache/beam/sdk/io/iceberg/catalog/IcebergCatalogBaseIT.java

+17 -6
@@ -45,6 +45,7 @@
 import java.util.stream.LongStream;
 import java.util.stream.Stream;
 import org.apache.beam.runners.direct.DirectOptions;
+import org.apache.beam.runners.direct.DirectRunner;
 import org.apache.beam.sdk.extensions.gcp.options.GcpOptions;
 import org.apache.beam.sdk.extensions.gcp.options.GcsOptions;
 import org.apache.beam.sdk.extensions.gcp.util.GcsUtil;
@@ -212,7 +213,11 @@ public void cleanUp() throws Exception {
   private static final String RANDOM = UUID.randomUUID().toString();
   @Rule public TestPipeline pipeline = TestPipeline.create();
   @Rule public TestName testName = new TestName();
-  @Rule public transient Timeout globalTimeout = Timeout.seconds(180);
+
+  @Rule
+  public transient Timeout globalTimeout =
+      Timeout.seconds(OPTIONS.getRunner().equals(DirectRunner.class) ? 180 : 20 * 60);
+
   private static final int NUM_SHARDS = 10;
   private static final Logger LOG = LoggerFactory.getLogger(IcebergCatalogBaseIT.class);
   private static final Schema DOUBLY_NESTED_ROW_SCHEMA =
@@ -444,15 +449,18 @@ public void testStreamingReadBetweenTimestamps() throws Exception {
   public void testWriteRead() throws IOException {
     Table table = catalog.createTable(TableIdentifier.parse(tableId()), ICEBERG_SCHEMA);
     List<Row> expectedRows = populateTable(table);
-    Map<String, Object> config = managedIcebergConfig(tableId());
+    Map<String, Object> readConfig = managedIcebergConfig(tableId());
+    String writeTableId = tableId() + "_2";
+    Map<String, Object> writeConfig = managedIcebergConfig(writeTableId);
 
     pipeline
-        .apply("read", Managed.read(ICEBERG).withConfig(config))
+        .apply("read", Managed.read(ICEBERG).withConfig(readConfig))
         .getSinglePCollection()
-        .apply("write", Managed.write(ICEBERG).withConfig(config));
+        .apply("write", Managed.write(ICEBERG).withConfig(writeConfig));
     pipeline.run().waitUntilFinish();
 
-    List<Record> returnedRecords = readRecords(table);
+    List<Record> returnedRecords =
+        readRecords(catalog.loadTable(TableIdentifier.parse(writeTableId)));
     assertThat(
         returnedRecords,
         containsInAnyOrder(expectedRows.stream().map(RECORD_FUNC::apply).toArray()));
@@ -469,16 +477,19 @@ public void testReadWriteStreaming() throws IOException {
     readConfig.put("to_timestamp", System.currentTimeMillis());
     readConfig.put("streaming", true);
 
+    String writeTableId = tableId() + "_2";
     Map<String, Object> writeConfig = new HashMap<>(config);
     writeConfig.put("triggering_frequency_seconds", 5);
+    writeConfig.put("table", writeTableId);
 
     pipeline
         .apply("streaming read", Managed.read(ICEBERG_CDC).withConfig(readConfig))
         .getSinglePCollection()
         .apply("streaming write", Managed.write(ICEBERG).withConfig(writeConfig));
     pipeline.run().waitUntilFinish();
 
-    List<Record> returnedRecords = readRecords(table);
+    List<Record> returnedRecords =
+        readRecords(catalog.loadTable(TableIdentifier.parse(writeTableId)));
     assertThat(
         returnedRecords,
         containsInAnyOrder(expectedRows.stream().map(RECORD_FUNC::apply).toArray()));
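Two behavioral changes here are worth calling out. First, the global JUnit timeout is now runner-dependent: Timeout.seconds(180) (3 minutes) when OPTIONS.getRunner() is DirectRunner, and 20 * 60 = 1200 seconds (20 minutes) otherwise, presumably to accommodate Dataflow job submission and worker startup. Second, testWriteRead and testReadWriteStreaming now write to a separate table (tableId() + "_2") and verify by re-loading that table from the catalog, so the assertion checks the pipeline's written output rather than the already-populated source table it read from.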
