@@ -37,16 +37,15 @@
 
 import apache_beam as beam
 from apache_beam import pvalue
-from apache_beam.transforms import util
 from apache_beam.io import filesystems as fs
 from apache_beam.io.gcp import bigquery_tools
 from apache_beam.io.gcp.bigquery_io_metadata import create_bigquery_io_metadata
 from apache_beam.metrics.metric import Lineage
 from apache_beam.options import value_provider as vp
 from apache_beam.options.pipeline_options import GoogleCloudOptions
 from apache_beam.transforms import trigger
+from apache_beam.transforms import util
 from apache_beam.transforms.display import DisplayDataItem
-from apache_beam.transforms.util import GroupIntoBatches
 from apache_beam.transforms.window import GlobalWindows
 
 # Protect against environments where bigquery library is not available.
@@ -1063,7 +1062,7 @@ def _write_files_with_auto_sharding(
         destination_data_kv_pc
         |
         'ToHashableTableRef' >> beam.Map(bigquery_tools.to_hashable_table_ref)
-        | 'WithAutoSharding' >> GroupIntoBatches.WithShardedKey(
+        | 'WithAutoSharding' >> util.GroupIntoBatches.WithShardedKey(
             batch_size=_FILE_TRIGGERING_RECORD_COUNT,
             max_buffering_duration_secs=_FILE_TRIGGERING_BATCHING_DURATION_SECS,
             clock=clock)
@@ -1102,9 +1101,11 @@ def _load_data(
     of the load jobs would fail but not other. If any of them fails, then
     copy jobs are not triggered.
     """
-    # Ensure that TriggerLoadJob retry inputs are deterministic by breaking
-    # fusion for inputs.
-    if not util.is_compat_version_prior_to(p.options, "2.65.0"):
+    self.reshuffle_before_load = not util.is_compat_version_prior_to(
+        p.options, "2.65.0")
+    if self.reshuffle_before_load:
+      # Ensure that TriggerLoadJob retry inputs are deterministic by breaking
+      # fusion for inputs.
       partitions_using_temp_tables = (
           partitions_using_temp_tables
           | "ReshuffleBeforeLoadWithTempTables" >> beam.Reshuffle())
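
Not part of the diff: a minimal sketch of the fusion-breaking pattern the comment above refers to. beam.Reshuffle() materializes elements at a shuffle boundary, so retries of the downstream step (a hypothetical stand-in for the load-job trigger) consume the same inputs even when the producing step is nondeterministic. Step names and data below are illustrative assumptions only.

import uuid

import apache_beam as beam


def _tag_with_random_id(row):
  # Nondeterministic step: re-running it would produce different ids.
  return (str(uuid.uuid4()), row)


with beam.Pipeline() as p:
  _ = (
      p
      | beam.Create([{'f': 1}, {'f': 2}])
      | 'TagWithRandomId' >> beam.Map(_tag_with_random_id)
      # Break fusion: the tagged elements are checkpointed here, so retries of
      # the next step see a stable, deterministic input.
      | 'ReshuffleBeforeLoad' >> beam.Reshuffle()
      | 'StandInForTriggerLoadJobs' >> beam.Map(print))

As the gate in the hunk suggests, the new reshuffle appears to apply only when the pipeline is not pinned to a compatibility version prior to 2.65.0, leaving the previous fusion behavior in place for older compat settings.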