Enable prism by default (where supported) #34612

Open: wants to merge 84 commits into master from users/damccorm/prismByDefault
Commits (84)
4d9c497
Enable prism by default
damccorm Apr 11, 2025
99837ca
Clean up fallback code
damccorm Apr 11, 2025
e28e463
Clean up fallback code
damccorm Apr 11, 2025
837ba2f
Add missing base case
damccorm Apr 11, 2025
3233b69
Add missing base case
damccorm Apr 11, 2025
4929906
Fix tests
damccorm Apr 14, 2025
ad9b0dc
Fix tests
damccorm Apr 14, 2025
20370cb
exclude unsupported state
damccorm Apr 15, 2025
cbaa083
Merge branch 'master' of https://github.com/apache/beam into users/da…
damccorm Apr 16, 2025
7e13214
Merge branch 'master' of https://github.com/apache/beam into users/da…
damccorm May 12, 2025
a6db1a4
Exclude locally materialized results
damccorm May 12, 2025
1e49f55
Fix fallback/dupe execution
damccorm May 12, 2025
e6ca143
Fix some snippet tests
damccorm May 13, 2025
4e3a675
Disable argparse abbreviation
damccorm May 13, 2025
a9e07a7
Some test fixes for prism switch
damccorm May 14, 2025
94ad26a
Merge in master
damccorm May 14, 2025
bda68b1
lint
damccorm May 15, 2025
d8b3534
row fixes
damccorm May 15, 2025
c21dafa
Make row comp logic default
damccorm May 15, 2025
b8343e6
Add more yaml examples
damccorm May 15, 2025
e08ead5
Fix dataframes tests
damccorm May 16, 2025
095caca
Merge branch 'master' of https://github.com/apache/beam into users/da…
damccorm May 16, 2025
81e5c54
Examples fixes
damccorm May 16, 2025
9250510
type hints
damccorm May 16, 2025
3d613c5
Some more transform test fixes
damccorm May 17, 2025
81e5e8a
more test fixes
damccorm May 19, 2025
99c07a7
Merge branch 'master' of https://github.com/apache/beam into users/da…
damccorm May 29, 2025
d6ff73e
More generic error checks
damccorm May 29, 2025
9b8218a
Reshuffle tests
damccorm May 29, 2025
f62e442
enrichment error catching
damccorm May 30, 2025
22fafe7
more test fixes
damccorm May 30, 2025
5e75f32
Scope out external transforms
damccorm May 30, 2025
baf2468
simplify test
damccorm May 30, 2025
e057572
Fix more tests
damccorm May 30, 2025
e3e757b
Clean up test
damccorm Jun 2, 2025
77f682e
correct runner mode
damccorm Jun 2, 2025
a70a33f
ML tests
damccorm Jun 2, 2025
2678f7d
ib collect test
damccorm Jun 2, 2025
9b7703e
Make sure assertions dont fire in incorrect order
damccorm Jun 2, 2025
60f0918
ML test fixes
damccorm Jun 3, 2025
f04c952
typing
damccorm Jun 3, 2025
621749b
More fixes
damccorm Jun 3, 2025
81ac404
some more fixes
damccorm Jun 3, 2025
2b6c279
Another error fix
damccorm Jun 3, 2025
095b1a3
Temporarily set log level to debug
damccorm Jun 3, 2025
84ade39
yapf
damccorm Jun 3, 2025
998dccf
More error regex fixes
damccorm Jun 3, 2025
e3dfb34
Fix some error messages/metric tests
damccorm Jun 4, 2025
1af63b7
more generic tests
damccorm Jun 4, 2025
ce747b5
Upgrade logging to warning to see what is happening
damccorm Jun 5, 2025
9a7529d
Some more patches
damccorm Jun 6, 2025
8f76ca4
Wait until finish for test pipelines
damccorm Jun 9, 2025
7b43e33
Merge branch 'master' of https://github.com/apache/beam into users/da…
damccorm Jun 9, 2025
a10a882
fix test_always
damccorm Jun 9, 2025
8dccd5b
A few more small fixes
damccorm Jun 9, 2025
c0fc545
Merge in master
damccorm Jun 12, 2025
3f73418
Temporarily update logging
damccorm Jun 16, 2025
aa4b03b
Merge branch 'master' of https://github.com/apache/beam into users/da…
damccorm Jun 20, 2025
f21b94a
clean up merge
damccorm Jun 20, 2025
41c20e0
Some more exclusions
damccorm Jun 23, 2025
9aff0d1
Merge branch 'users/damccorm/prismByDefault' of https://github.com/ap…
damccorm Jun 23, 2025
8daca3b
regex isssues
damccorm Jun 23, 2025
34a4f13
Propogate original failure when using instruction cache
damccorm Jun 23, 2025
90edb60
Batching fix
damccorm Jun 23, 2025
544fe04
Trigger some postcommits
damccorm Jun 23, 2025
7f52afa
fmt
damccorm Jun 23, 2025
1efa877
More test fixes
damccorm Jun 23, 2025
6504c33
Fix a few more tests
damccorm Jun 24, 2025
50495a4
linting
damccorm Jun 24, 2025
2410ca6
Bump workflow timeout (#35420)
damccorm Jun 25, 2025
46bb8a9
linting/fixes
damccorm Jun 26, 2025
2d5cd07
Merge branch 'users/damccorm/prismByDefault' of https://github.com/ap…
damccorm Jun 26, 2025
051a4ed
Merge in master
damccorm Jun 26, 2025
7909c00
linting
damccorm Jun 26, 2025
2ca2f91
Avoid problem with temp file getting deleted
damccorm Jun 27, 2025
96d9f5b
Avoid problem with temp file getting deleted
damccorm Jun 27, 2025
8a8ed63
minor cleanup
damccorm Jun 27, 2025
83472ce
Some more minor fixes
damccorm Jun 27, 2025
6efda27
Merge branch 'master' of https://github.com/apache/beam into users/da…
damccorm Jun 27, 2025
8fde571
Merge in master
damccorm Jun 27, 2025
068d3a0
Fix test
damccorm Jun 27, 2025
a5dde5b
Fix another test with questionable assumptions
damccorm Jun 30, 2025
cb97e70
Dont wait on tmpfile being destroyed
damccorm Jun 30, 2025
3644d18
Merge branch 'master' of https://github.com/apache/beam into users/da…
damccorm Jul 1, 2025
@@ -1,4 +1,4 @@
 {
   "comment": "Modify this file in a trivial way to cause this test suite to run",
-  "modification": 0
+  "modification": 1
 }
@@ -1,4 +1,4 @@
 {
   "comment": "Modify this file in a trivial way to cause this test suite to run",
-  "modification": 5
+  "modification": 6
 }
2 changes: 1 addition & 1 deletion .github/workflows/beam_PostCommit_Python_Dependency.yml
@@ -59,7 +59,7 @@ jobs:
       matrix:
         job_name: [beam_PostCommit_Python_Dependency]
         job_phrase: [Run Python PostCommit Dependency]
-    timeout-minutes: 120
+    timeout-minutes: 180
     if: |
       github.event_name == 'workflow_dispatch' ||
       github.event_name == 'pull_request_target' ||
4 changes: 2 additions & 2 deletions .github/workflows/beam_PostCommit_Python_Xlang_Gcp_Direct.yml
@@ -57,7 +57,7 @@ jobs:
       (github.event_name == 'schedule' && github.repository == 'apache/beam') ||
       github.event.comment.body == 'Run Python_Xlang_Gcp_Direct PostCommit'
     runs-on: [self-hosted, ubuntu-20.04, highmem]
-    timeout-minutes: 100
+    timeout-minutes: 160
     name: ${{ matrix.job_name }} (${{ matrix.job_phrase }})
     strategy:
       matrix:
@@ -98,4 +98,4 @@ jobs:
         commit: '${{ env.prsha || env.GITHUB_SHA }}'
         comment_mode: ${{ github.event_name == 'issue_comment' && 'always' || 'off' }}
         files: '**/pytest*.xml'
-        large_files: true
\ No newline at end of file
+        large_files: true
2 changes: 2 additions & 0 deletions CHANGES.md
@@ -65,6 +65,7 @@

 * New highly anticipated feature X added to Python SDK ([#X](https://github.com/apache/beam/issues/X)).
 * New highly anticipated feature Y added to Java SDK ([#Y](https://github.com/apache/beam/issues/Y)).
+* [Python] Prism runner now enabled by default for most Python pipelines using the direct runner ([#34612](https://github.com/apache/beam/pull/34612)). This may break some tests; see https://github.com/apache/beam/pull/34612 for details on how to handle issues.

 ## I/Os

@@ -81,6 +82,7 @@

 ## Breaking Changes

+* [Python] Prism runner now enabled by default for most Python pipelines using the direct runner ([#34612](https://github.com/apache/beam/pull/34612)). This may break some tests; see https://github.com/apache/beam/pull/34612 for details on how to handle issues.
 * X behavior was changed ([#X](https://github.com/apache/beam/issues/X)).

 ## Deprecations
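For pipeline authors affected by this change, here is a minimal sketch of the opt-out, assuming the standard Beam options API (the diffs below pin runners the same way in tests; the exact runner strings accepted by a given release should be verified there):

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

# Naming a runner explicitly bypasses the new Prism default; the tests in
# this PR use TestPipeline('FnApiRunner') for the same effect.
options = PipelineOptions(runner='DirectRunner')
with beam.Pipeline(options=options) as p:
  _ = p | beam.Create([1, 2, 3]) | beam.Map(lambda x: x * 2)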
2 changes: 1 addition & 1 deletion sdks/python/apache_beam/coders/coders_test.py
@@ -267,7 +267,7 @@ def test_numpy_int(self):
     # this type is not supported as the key
     import numpy as np

-    with self.assertRaises(TypeError):
+    with self.assertRaises(Exception):
       with TestPipeline() as p:
         indata = p | "Create" >> beam.Create([(a, int(a))
                                               for a in np.arange(3)])
13 changes: 7 additions & 6 deletions sdks/python/apache_beam/examples/snippets/snippets_test.py
@@ -307,8 +307,8 @@ def test_bad_types(self):
     # When running this pipeline, you'd get a runtime error,
     # possibly on a remote machine, possibly very late.

-    with self.assertRaises(TypeError):
-      p.run()
+    with self.assertRaises(Exception):
+      p.run().wait_until_finish()

     # To catch this early, we can assert what types we expect.
     with self.assertRaises(typehints.TypeCheckError):
@@ -372,8 +372,8 @@ def process(self, element):
     # When running this pipeline, you'd get a runtime error,
     # possibly on a remote machine, possibly very late.

-    with self.assertRaises(TypeError):
-      p.run()
+    with self.assertRaises(Exception):
+      p.run().wait_until_finish()

     # To catch this early, we can annotate process() with the expected types.
     # Beam will then use these as type hints and perform type checking before
@@ -439,12 +439,13 @@ def test_runtime_checks_off(self):

   def test_runtime_checks_on(self):
     # pylint: disable=expression-not-assigned
-    with self.assertRaises(typehints.TypeCheckError):
+    with self.assertRaises(Exception):
       # [START type_hints_runtime_on]
       p = TestPipeline(options=PipelineOptions(runtime_type_check=True))
       p | beam.Create(['a']) | beam.Map(lambda x: 3).with_output_types(str)
-      p.run()
+      result = p.run()
       # [END type_hints_runtime_on]
+      result.wait_until_finish()

   def test_deterministic_key(self):
     with TestPipeline() as p:
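The recurring pattern in this file's changes: with the Prism default, user-code errors can surface when waiting for the result rather than inside p.run() itself, and they may arrive wrapped in runner-specific exception types. A minimal sketch of the test idiom the hunks above converge on (p is the pipeline under test):

with self.assertRaises(Exception):
  result = p.run()            # may return before user code has executed
  result.wait_until_finish()  # runner-wrapped errors surface here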
@@ -60,28 +60,52 @@ def validate_enrichment_with_vertex_ai_legacy():
   return expected


+def std_out_to_dict(stdout_lines, row_key):
+  output_dict = {}
+  for stdout_line in stdout_lines:
+    # parse the stdout in a dictionary format so that it can be
+    # evaluated/compared as one. This allows us to compare without
+    # considering the order of the stdout or the order that the fields of the
+    # row are arranged in.
+    fmtd = '{\"' + stdout_line[4:-1].replace('=', '\": ').replace(
+        ', ', ', \"').replace('\"\'', '\'') + "}"
+    stdout_dict = eval(fmtd)  # pylint: disable=eval-used
+    output_dict[stdout_dict[row_key]] = stdout_dict
+  return output_dict
+
+
 @mock.patch('sys.stdout', new_callable=StringIO)
 class EnrichmentTest(unittest.TestCase):
   def test_enrichment_with_bigtable(self, mock_stdout):
     enrichment_with_bigtable()
     output = mock_stdout.getvalue().splitlines()
     expected = validate_enrichment_with_bigtable()
-    self.assertEqual(output, expected)

+    self.assertEqual(len(output), len(expected))
+    self.assertEqual(
+        std_out_to_dict(output, 'sale_id'),
+        std_out_to_dict(expected, 'sale_id'))

   def test_enrichment_with_vertex_ai(self, mock_stdout):
     enrichment_with_vertex_ai()
     output = mock_stdout.getvalue().splitlines()
     expected = validate_enrichment_with_vertex_ai()

-    for i in range(len(expected)):
-      self.assertEqual(set(output[i].split(',')), set(expected[i].split(',')))
+    self.assertEqual(len(output), len(expected))
+    self.assertEqual(
+        std_out_to_dict(output, 'user_id'),
+        std_out_to_dict(expected, 'user_id'))

   def test_enrichment_with_vertex_ai_legacy(self, mock_stdout):
     enrichment_with_vertex_ai_legacy()
     output = mock_stdout.getvalue().splitlines()
     expected = validate_enrichment_with_vertex_ai_legacy()
-    self.maxDiff = None
-    self.assertEqual(output, expected)

+    self.assertEqual(len(output), len(expected))
+    self.assertEqual(
+        std_out_to_dict(output, 'entity_id'),
+        std_out_to_dict(expected, 'entity_id'))


 if __name__ == '__main__':
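To make the new helper concrete, a small worked sketch reusing std_out_to_dict from the diff above, with a hypothetical Row string of the shape these examples print (values invented):

line = 'Row(sale_id=1, customer_id=1, product_id=2)'
# std_out_to_dict strips 'Row(' and ')', rewrites the '=' pairs into dict
# syntax, eval()s the result, and keys the row dict by row_key:
print(std_out_to_dict([line], 'sale_id'))
# {1: {'sale_id': 1, 'customer_id': 1, 'product_id': 2}}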
@@ -37,8 +37,6 @@ def pardo_dofn_methods(test=None):
   # Portable runners do not guarantee that teardown will be executed, so we
   # use FnApiRunner instead of prism.
   runner = 'FnApiRunner'
-  # TODO(damccorm) - remove after next release
-  runner = 'DirectRunner'
   # [START pardo_dofn_methods]
   import apache_beam as beam

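For context on the teardown caveat in the comment above, a minimal sketch of the DoFn lifecycle methods the snippet exercises (standard Beam API; the print strings are illustrative):

import apache_beam as beam

class LifecycleDoFn(beam.DoFn):
  def setup(self):
    # Called once per DoFn instance, e.g. to open a connection.
    print('setup')

  def start_bundle(self):
    print('start_bundle')

  def process(self, element):
    yield element

  def finish_bundle(self):
    print('finish_bundle')

  def teardown(self):
    # Best-effort on portable runners, which is why the snippet pins
    # FnApiRunner instead of relying on the Prism default.
    print('teardown')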
12 changes: 8 additions & 4 deletions sdks/python/apache_beam/io/fileio_test.py
@@ -106,7 +106,7 @@ def test_match_files_one_directory_failure1(self):
     files.append(self._create_temp_file(dir=directories[0]))
     files.append(self._create_temp_file(dir=directories[0]))

-    with self.assertRaises(beam.io.filesystem.BeamIOError):
+    with self.assertRaises(Exception):
       with TestPipeline() as p:
         files_pc = (
             p
@@ -259,7 +259,7 @@ def test_fail_on_directories(self):
     files.append(self._create_temp_file(dir=tempdir, content=content))
     files.append(self._create_temp_file(dir=tempdir, content=content))

-    with self.assertRaises(beam.io.filesystem.BeamIOError):
+    with self.assertRaises(Exception):
       with TestPipeline() as p:
         _ = (
             p
@@ -501,10 +501,14 @@ def test_write_to_dynamic_destination(self):
         fileio.TextSink()  # pass a FileSink object
     ]

+    # Test assumes that all records will be handled by the same worker
+    # process; pin to FnApiRunner to guarantee this.
+    runner = 'FnApiRunner'
+
     for sink in sink_params:
       dir = self._new_tempdir()

-      with TestPipeline() as p:
+      with TestPipeline(runner) as p:
         _ = (
             p
             | "Create" >> beam.Create(range(100))
@@ -515,7 +519,7 @@ def test_write_to_dynamic_destination(self):
             sink=sink,
             file_naming=fileio.destination_prefix_naming("test")))

-      with TestPipeline() as p:
+      with TestPipeline(runner) as p:
         result = (
             p
             | fileio.MatchFiles(FileSystems.join(dir, '*'))
4 changes: 2 additions & 2 deletions sdks/python/apache_beam/io/gcp/bigquery_file_loads_test.py
@@ -447,8 +447,8 @@ def test_records_traverse_transform_with_mocks(self):
         validate=False,
         temp_file_format=bigquery_tools.FileFormat.JSON)

-    # Need to test this with the DirectRunner to avoid serializing mocks
-    with TestPipeline('DirectRunner') as p:
+    # Need to test this with the FnApiRunner to avoid serializing mocks
+    with TestPipeline('FnApiRunner') as p:
       outputs = p | beam.Create(_ELEMENTS) | transform

     dest_files = outputs[bqfl.BigQueryBatchFileLoads.DESTINATION_FILE_PAIRS]
@@ -387,7 +387,7 @@ def test_read_with_transaction(
   def test_invalid_transaction(
       self, mock_batch_snapshot_class, mock_client_class):
     # test exception raises at pipeline execution time
-    with self.assertRaises(ValueError), TestPipeline() as p:
+    with self.assertRaises(Exception), TestPipeline() as p:
       transaction = (
           p | beam.Create([{
              "invalid": "transaction"
23 changes: 11 additions & 12 deletions sdks/python/apache_beam/io/parquetio_test.py
@@ -52,7 +52,6 @@

 try:
   import pyarrow as pa
-  import pyarrow.lib as pl
   import pyarrow.parquet as pq
 except ImportError:
   pa = None
@@ -338,17 +337,16 @@ def test_write_batched_display_data(self):
       ARROW_MAJOR_VERSION >= 13,
       'pyarrow 13.x and above does not throw ArrowInvalid error')
   def test_sink_transform_int96(self):
-    with tempfile.NamedTemporaryFile() as dst:
+    with self.assertRaisesRegex(Exception, 'would lose data'):
+      # Should throw an error "ArrowInvalid: Casting from timestamp[ns] to
+      # timestamp[us] would lose data"
+      dst = tempfile.NamedTemporaryFile()
       path = dst.name
-      # pylint: disable=c-extension-no-member
-      with self.assertRaises(pl.ArrowInvalid):
-        # Should throw an error "ArrowInvalid: Casting from timestamp[ns] to
-        # timestamp[us] would lose data"
-        with TestPipeline() as p:
-          _ = p \
-              | Create(self.RECORDS) \
-              | WriteToParquet(
-                  path, self.SCHEMA96, num_shards=1, shard_name_template='')
+      with TestPipeline() as p:
+        _ = p \
+            | Create(self.RECORDS) \
+            | WriteToParquet(
+                path, self.SCHEMA96, num_shards=1, shard_name_template='')

   def test_sink_transform(self):
     with TemporaryDirectory() as tmp_dirname:
@@ -571,7 +569,8 @@ def test_selective_columns(self):
   def test_sink_transform_multiple_row_group(self):
     with TemporaryDirectory() as tmp_dirname:
       path = os.path.join(tmp_dirname + "tmp_filename")
-      with TestPipeline() as p:
+      # Pin to FnApiRunner since test assumes fixed bundle size
+      with TestPipeline('FnApiRunner') as p:
         # writing 623200 bytes of data
         _ = p \
             | Create(self.RECORDS * 4000) \
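As background for the int96 change above, a standalone sketch of the pyarrow behavior being asserted (my own example values; per the test's skip condition this applies to pyarrow versions below 13):

import pyarrow as pa

# A nanosecond timestamp that is not a whole microsecond.
arr = pa.array([1001], type=pa.timestamp('ns'))

try:
  # Safe casting (the default) refuses to silently drop sub-microsecond
  # precision and raises ArrowInvalid.
  arr.cast(pa.timestamp('us'))
except pa.lib.ArrowInvalid as e:
  print(e)  # "Casting from timestamp[ns] to timestamp[us] would lose data..."

The new assertRaisesRegex(Exception, 'would lose data') matches that message however the runner wraps the exception, instead of depending on the concrete ArrowInvalid type.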
20 changes: 11 additions & 9 deletions sdks/python/apache_beam/io/requestresponse_test.py
@@ -31,8 +31,6 @@
   from apache_beam.io.requestresponse import Caller
   from apache_beam.io.requestresponse import DefaultThrottler
   from apache_beam.io.requestresponse import RequestResponseIO
-  from apache_beam.io.requestresponse import UserCodeExecutionException
-  from apache_beam.io.requestresponse import UserCodeTimeoutException
   from apache_beam.io.requestresponse import retry_on_exception
 except ImportError:
   raise unittest.SkipTest('RequestResponseIO dependencies are not installed.')
@@ -98,7 +96,7 @@ def test_valid_call(self):

   def test_call_timeout(self):
     caller = CallerWithTimeout()
-    with self.assertRaises(UserCodeTimeoutException):
+    with self.assertRaises(Exception):
       with TestPipeline() as test_pipeline:
         _ = (
             test_pipeline
@@ -107,7 +105,7 @@ def test_call_timeout(self):

   def test_call_runtime_error(self):
     caller = CallerWithRuntimeError()
-    with self.assertRaises(UserCodeExecutionException):
+    with self.assertRaises(Exception):
       with TestPipeline() as test_pipeline:
         _ = (
             test_pipeline
@@ -120,23 +118,23 @@ def test_retry_on_exception(self):

   def test_caller_backoff_retry_strategy(self):
     caller = CallerThatRetries()
-    with self.assertRaises(TooManyRequests) as cm:
+    with self.assertRaises(Exception) as cm:
       with TestPipeline() as test_pipeline:
         _ = (
             test_pipeline
             | beam.Create(["sample_request"])
             | RequestResponseIO(caller=caller))
-    self.assertRegex(cm.exception.message, 'retries = 2')
+    self.assertRegex(str(cm.exception), 'retries = 2')

   def test_caller_no_retry_strategy(self):
     caller = CallerThatRetries()
-    with self.assertRaises(TooManyRequests) as cm:
+    with self.assertRaises(Exception) as cm:
       with TestPipeline() as test_pipeline:
         _ = (
             test_pipeline
             | beam.Create(["sample_request"])
             | RequestResponseIO(caller=caller, repeater=None))
-    self.assertRegex(cm.exception.message, 'retries = 0')
+    self.assertRegex(str(cm.exception), 'retries = 0')

   @retry(
       retry=retry_if_exception_type(IndexError),
@@ -148,7 +146,11 @@ def test_default_throttler(self):
         window_ms=10000, bucket_ms=5000, overload_ratio=1)
     # manually override the number of received requests for testing.
     throttler.throttler._all_requests.add(time.time() * 1000, 100)
-    test_pipeline = TestPipeline()
+    # TODO(https://github.com/apache/beam/issues/34549): This test relies on
+    # metrics filtering which doesn't work on Prism yet because Prism renames
+    # steps (e.g. "Do" becomes "ref_AppliedPTransform_Do_7").
+    # https://github.com/apache/beam/blob/5f9cd73b7c9a2f37f83971ace3a399d633201dd1/sdks/python/apache_beam/runners/portability/fn_api_runner/fn_runner.py#L1590
+    test_pipeline = TestPipeline('FnApiRunner')
     _ = (
         test_pipeline
         | beam.Create(['sample_request'])
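To illustrate the step-renaming problem the TODO above describes, a sketch of the kind of metrics query the throttler test depends on (reusing the test's test_pipeline; the step name 'Do' is assumed for illustration):

from apache_beam.metrics.metric import MetricsFilter

result = test_pipeline.run()
result.wait_until_finish()
# An exact step filter like this can come up empty on Prism, which reports
# the step as 'ref_AppliedPTransform_Do_7' rather than 'Do'; pinning
# FnApiRunner keeps the original step names.
throttle_metrics = result.metrics().query(MetricsFilter().with_step('Do'))
counters = throttle_metrics['counters']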
5 changes: 2 additions & 3 deletions sdks/python/apache_beam/metrics/metric_test.py
@@ -201,16 +201,15 @@ def process(self, element):
     # Verify user distribution counter.
     metric_results = res.metrics().query()
     matcher = MetricResultMatcher(
-        step='ApplyPardo',
+        step=hc.contains_string('ApplyPardo'),
         namespace=hc.contains_string('SomeDoFn'),
         name='element_dist',
         committed=DistributionMatcher(
             sum_value=hc.greater_than_or_equal_to(0),
             count_value=hc.greater_than_or_equal_to(0),
             min_value=hc.greater_than_or_equal_to(0),
             max_value=hc.greater_than_or_equal_to(0)))
-    hc.assert_that(
-        metric_results['distributions'], hc.contains_inanyorder(matcher))
+    hc.assert_that(metric_results['distributions'], hc.has_item(matcher))

   def test_create_counter_distribution(self):
     sampler = statesampler.StateSampler('', counters.CounterFactory())
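A note on the matcher change above: hc.contains_inanyorder(matcher) requires the sequence to consist of exactly the given items, while hc.has_item(matcher) only requires at least one element to match, which tolerates the extra or renamed metric results Prism reports. A toy sketch with invented values:

import hamcrest as hc

steps = ['ref_AppliedPTransform_ApplyPardo_7', 'other_step']

# Passes: at least one element matches the substring matcher.
hc.assert_that(steps, hc.has_item(hc.contains_string('ApplyPardo')))

# Would fail: the list would have to contain exactly one matching element.
# hc.assert_that(steps, hc.contains_inanyorder(hc.contains_string('ApplyPardo')))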
@@ -142,17 +142,17 @@ def test_scoring_with_unmatched_features(self):
     # (see the `test_scoring_with_matched_features`)
     detector = PyODFactory.create_detector(self.pickled_model_uri)
     options = PipelineOptions([])
-    p = beam.Pipeline(options=options)
-    _ = (
-        p | beam.Create(self.get_test_data_with_target())
-        | beam.Map(
-            lambda x: beam.Row(**dict(zip(["a", "b", "target"], map(int, x)))))
-        | beam.WithKeys(0)
-        | AnomalyDetection(detector=detector))
-
     # This should raise a ValueError with message
     # "X has 3 features, but IsolationForest is expecting 2 features as input."
-    self.assertRaises(ValueError, p.run)
+    with self.assertRaises(Exception):
+      with beam.Pipeline(options=options) as p:
+        _ = (
+            p | beam.Create(self.get_test_data_with_target())
+            | beam.Map(
+                lambda x: beam.Row(
+                    **dict(zip(["a", "b", "target"], map(int, x)))))
+            | beam.WithKeys(0)
+            | AnomalyDetection(detector=detector))


 if __name__ == '__main__':