Add --fix_tests option to more easily generate YAML tests.

robertwb · robertwb · commit 902f75b16626 · 2025-04-07T11:09:36.000-07:00
diff --git a/sdks/python/apache_beam/yaml/main.py b/sdks/python/apache_beam/yaml/main.py
@@ -30,6 +30,7 @@
 from apache_beam.typehints.schemas import MillisInstant
 from apache_beam.yaml import yaml_testing
 from apache_beam.yaml import yaml_transform
+from apache_beam.yaml import yaml_utils
 
 
 def _preparse_jinja_flags(argv):
@@ -98,6 +99,10 @@ def _parse_arguments(argv):
       action=argparse.BooleanOptionalAction,
       help='Run the tests associated with the given pipeline, rather than the '
       'pipeline itself.')
+  parser.add_argument(
+      '--fix_tests',
+      action=argparse.BooleanOptionalAction,
+      help='Update failing test expectations to match the actual output.')
   parser.add_argument(
       '--test_suite',
       help='Run the given tests against the given pipeline, rather than the '
@@ -149,24 +154,54 @@ def run_tests(argv=None, exit=True):
   pipeline_spec = yaml.load(pipeline_yaml, Loader=yaml_transform.SafeLineLoader)
   options = _build_pipeline_options(pipeline_spec, pipeline_args)
 
-  test_specs = pipeline_spec.get('tests', [])
-  if not isinstance(test_specs, list):
-    raise TypeError('tests attribute must be a list of test specifications')
   if known_args.test_suite:
     with open(known_args.test_suite) as fin:
-      more_test_specs = yaml.load(fin, Loader=yaml_transform.SafeLineLoader)
-    if 'tests' not in more_test_specs or not isinstance(
-        more_test_specs['tests'], list):
+      test_suite = yaml.load(fin, Loader=yaml_transform.SafeLineLoader)
+    if 'tests' not in test_suite or not isinstance(test_suite['tests'], list):
+      raise TypeError('tests attribute must be a list of test specifications')
+    test_specs = test_suite['tests']
+  else:
+    test_specs = pipeline_spec.get('tests', [])
+    if not isinstance(test_specs, list):
       raise TypeError('tests attribute must be a list of test specifications')
-    test_specs += more_test_specs['tests']
   if not test_specs:
     raise RuntimeError('No tests found.')
 
   with _fix_xlang_instant_coding():
-    suite = unittest.TestSuite()
-    for test_spec in test_specs:
-      suite.addTest(_YamlTestCase(pipeline_spec, test_spec, options))
+    tests = [
+        _YamlTestCase(pipeline_spec, test_spec, options, known_args.fix_tests)
+        for test_spec in test_specs
+    ]
+    suite = unittest.TestSuite(tests)
     result = unittest.TextTestRunner().run(suite)
+
+  if known_args.fix_tests:
+    path = known_args.test_suite or known_args.yaml_pipeline_file
+    if not path:
+      raise RuntimeError('Test fixing only supported for file-backed tests.')
+    with open(path) as fin:
+      original_yaml = fin.read()
+    # pipeline_yaml is the (possibly template-expanded) text that was parsed; if
+    # it differs from the raw file, fixes cannot be mapped back onto the file.
+    if (path == known_args.yaml_pipeline_file and
+        pipeline_yaml.strip() != original_yaml.strip()):
+      raise RuntimeError('In-file test fixing not yet supported for templated pipelines.')
+    updated_spec = yaml.load(original_yaml, Loader=yaml.SafeLoader)
+
+    for ix, test in enumerate(tests):
+      if getattr(test, 'fixes', None):  # unset when the test errored early
+        test_spec = yaml_transform.SafeLineLoader.strip_metadata(test.spec())
+        assert test_spec == updated_spec['tests'][ix]
+        for (loc, name), values in test.fixes.items():
+          for expectation in updated_spec['tests'][ix][loc]:
+            if expectation['name'] == name:
+              expectation['elements'] = sorted(values, key=json.dumps)
+              break
+
+    updated_yaml = yaml_utils.patch_yaml(original_yaml, updated_spec)
+    with open(path, 'w') as fout:
+      fout.write(updated_yaml)
+
   if exit:
     # emulates unittest.main()
     sys.exit(0 if result.wasSuccessful() else 1)
@@ -233,14 +268,16 @@ def constructor(root):
 
 
 class _YamlTestCase(unittest.TestCase):
-  def __init__(self, pipeline_spec, test_spec, options):
+  def __init__(self, pipeline_spec, test_spec, options, fix_tests):
     super().__init__()
     self._pipeline_spec = pipeline_spec
     self._test_spec = test_spec
     self._options = options
+    self._fix_tests = fix_tests
 
   def runTest(self):
-    yaml_testing.run_test(self._pipeline_spec, self._test_spec, self._options)
+    self.fixes = yaml_testing.run_test(
+        self._pipeline_spec, self._test_spec, self._options, self._fix_tests)
 
   def id(self):
     return (
@@ -250,6 +287,9 @@ def id(self):
   def __str__(self):
     return self.id()
 
+  def spec(self):
+    return self._test_spec
+
 
 if __name__ == '__main__':
   import logging
diff --git a/sdks/python/apache_beam/yaml/main_test.py b/sdks/python/apache_beam/yaml/main_test.py
@@ -54,28 +54,28 @@
 
 PASSING_TEST_SUITE = '''
 tests:
-  - name: ExternalTest
+  - name: ExternalTest  # comment
     mock_outputs:
       - name: Create
         elements: ['a', 'b', 'c']
     expected_inputs:
       - name: WriteToText
         elements:
-          - {element: a}
-          - {element: b}
-          - {element: c}
+          - element: a
+          - element: b
+          - element: c
 '''
 
 FAILING_TEST_SUITE = '''
 tests:
-  - name: ExternalTest
+  - name: ExternalTest  # comment
     mock_outputs:
       - name: Create
         elements: ['a', 'b', 'c']
     expected_inputs:
       - name: WriteToText
         elements:
-          - {element: x}
+          - element: x
 '''
 
 
@@ -182,6 +182,23 @@ def test_external_test_specs(self):
         ],
                        exit=False)
 
+  def test_fix_suite(self):
+    """--fix_tests rewrites a failing external test suite in place."""
+    with tempfile.TemporaryDirectory() as tmpdir:
+      suite_path = os.path.join(tmpdir, 'tests.yaml')
+      with open(suite_path, 'w') as fout:
+        fout.write(FAILING_TEST_SUITE)
+
+      # exit=False keeps run_tests from calling sys.exit, so the rewritten
+      # file can still be inspected below.
+      argv = ['--yaml_pipeline', TEST_PIPELINE]
+      argv += ['--test_suite', suite_path, '--fix_tests']
+      main.run_tests(argv, exit=False)
+
+      # The patched file should now equal the passing suite exactly.
+      with open(suite_path) as fin:
+        fixed_yaml = fin.read()
+      self.assertEqual(fixed_yaml, PASSING_TEST_SUITE)
 
 if __name__ == '__main__':
   logging.getLogger().setLevel(logging.INFO)
diff --git a/sdks/python/apache_beam/yaml/yaml_testing.py b/sdks/python/apache_beam/yaml/yaml_testing.py
@@ -17,6 +17,7 @@
 
 import collections
 import functools
+import uuid
 from typing import Dict
 from typing import List
 from typing import Mapping
@@ -28,17 +29,21 @@
 import yaml
 
 import apache_beam as beam
+from apache_beam.testing.util import assert_that
+from apache_beam.testing.util import equal_to
+from apache_beam.yaml import yaml_provider
 from apache_beam.yaml import yaml_transform
 from apache_beam.yaml import yaml_utils
 
 
-def run_test(pipeline_spec, test_spec, options=None):
+def run_test(pipeline_spec, test_spec, options=None, fix_failures=False):
   if isinstance(pipeline_spec, str):
     pipeline_spec = yaml.load(pipeline_spec, Loader=yaml_utils.SafeLineLoader)
 
-  transform_spec = inject_test_tranforms(
+  transform_spec, recording_ids = inject_test_tranforms(
       yaml_transform.pipeline_as_composite(pipeline_spec['pipeline']),
-      test_spec)
+      test_spec,
+      fix_failures)
 
   allowed_sources = set(test_spec.get('allowed_sources', []) + ['Create'])
   for transform in transform_spec['transforms']:
@@ -57,7 +62,20 @@ def run_test(pipeline_spec, test_spec, options=None):
             pipeline_spec.get('options', {})))
 
   with beam.Pipeline(options=options) as p:
-    _ = p | yaml_transform.YamlTransform(transform_spec)
+    _ = p | yaml_transform.YamlTransform(
+        transform_spec,
+        providers={'AssertEqualAndRecord': AssertEqualAndRecord})
+
+  if fix_failures:
+    fixes = {}
+    for recording_id in recording_ids:
+      if AssertEqualAndRecord.has_recorded_result(recording_id):
+        fixes[recording_id[1:]] = [
+            row._asdict() if isinstance(row, beam.Row) else row
+            for row in AssertEqualAndRecord.get_recorded_result(recording_id)
+        ]
+        AssertEqualAndRecord.remove_recorded_result(recording_id)
+    return fixes
 
 
 def validate_test_spec(test_spec):
@@ -116,7 +134,7 @@ def validate_test_spec(test_spec):
             f'must be a list, got {type(attr_item["elements"])}')
 
 
-def inject_test_tranforms(spec, test_spec):
+def inject_test_tranforms(spec, test_spec, fix_failures):
   validate_test_spec(test_spec)
   # These are idempotent, so it's OK to do them preemptively.
   for phase in [
@@ -140,6 +158,9 @@ def inject_test_tranforms(spec, test_spec):
       for mock_output in test_spec.get('mock_outputs', [])
   })
 
+  recording_id_prefix = str(uuid.uuid4())
+  recording_ids = []
+
   transforms = []
 
   @functools.cache
@@ -213,38 +234,94 @@ def create_create(name, elements):
         },
     }
 
-  def create_assertion(name, inputs, elements):
+  def create_assertion(name, inputs, elements, recording_id=None):
     return {
         '__uuid__': yaml_utils.SafeLineLoader.create_uuid(),
         'name': name,
         'input': inputs,
-        'type': 'AssertEqual',
+        'type': 'AssertEqualAndRecord',
         'config': {
             'elements': elements,
+            'recording_id': recording_id,
         },
     }
 
   for expected_output in test_spec.get('expected_outputs', []):
+    if fix_failures:
+      recording_id = (
+          recording_id_prefix, 'expected_outputs', expected_output['name'])
+      recording_ids.append(recording_id)
+    else:
+      recording_id = None
     require_output(expected_output['name'])
     transforms.append(
         create_assertion(
             f'CheckExpectedOutput[{expected_output["name"]}]',
             expected_output['name'],
-            expected_output['elements']))
+            expected_output['elements'],
+            recording_id))
 
   for expected_input in test_spec.get('expected_inputs', []):
+    if fix_failures:
+      recording_id = (
+          recording_id_prefix, 'expected_inputs', expected_input['name'])
+      recording_ids.append(recording_id)
+    else:
+      recording_id = None
     transform_id = scope.get_transform_id(expected_input['name'])
     transforms.append(
         create_assertion(
             f'CheckExpectedInput[{expected_input["name"]}]',
             create_inputs(transform_id),
-            expected_input['elements']))
+            expected_input['elements'],
+            recording_id))
 
   return {
       '__uuid__': yaml_utils.SafeLineLoader.create_uuid(),
       'type': 'composite',
       'transforms': transforms,
-  }
+  }, recording_ids
+
+
+class AssertEqualAndRecord(beam.PTransform):
+  """Asserts a PCollection matches `elements`, optionally recording actuals."""
+  # Actual elements of failed comparisons, keyed by recording id (class-wide).
+  _recorded_results = {}
+
+  @classmethod
+  def store_recorded_result(cls, recording_id, value):
+    assert recording_id not in cls._recorded_results
+    cls._recorded_results[recording_id] = value
+
+  @classmethod
+  def has_recorded_result(cls, recording_id):
+    return recording_id in cls._recorded_results
+
+  @classmethod
+  def get_recorded_result(cls, recording_id):
+    return cls._recorded_results[recording_id]
+
+  @classmethod
+  def remove_recorded_result(cls, recording_id):
+    del cls._recorded_results[recording_id]
+
+  def __init__(self, elements, recording_id):
+    self._elements = elements
+    self._recording_id = recording_id
+
+  def expand(self, pcoll):
+    expected_matcher = equal_to(yaml_provider.dicts_to_rows(self._elements))
+    def record_or_raise(actual):
+      try:
+        expected_matcher(actual)
+      except Exception:  # with a recording id, keep the actual, don't fail
+        if not self._recording_id:
+          raise
+        recording_key = tuple(self._recording_id)
+        AssertEqualAndRecord.store_recorded_result(recording_key, actual)
+
+    as_rows = pcoll | beam.Map(lambda row: beam.Row(**row._asdict()))
+    return assert_that(as_rows, record_or_raise)
 
 
 K1 = TypeVar('K1')
diff --git a/sdks/python/apache_beam/yaml/yaml_testing_test.py b/sdks/python/apache_beam/yaml/yaml_testing_test.py
@@ -163,6 +163,35 @@ def test_unmocked_inputs(self):
               }]
           })
 
+  def test_fixes(self):
+    """With fix_failures, run_test returns the actual elements on mismatch."""
+    # The expectation omits the third row, so the assertion cannot pass as-is.
+    incomplete_spec = {
+        'mock_outputs': [{
+            'name': 'MyRead',
+            'elements': [1, 2, 3],
+        }],
+        'expected_inputs': [{
+            'name': 'ToBeExcluded',
+            'elements': [
+                dict(element=1, square=1),
+                dict(element=2, square=4),
+            ],
+        }],
+    }
+    # Fixes are keyed by (section, transform name).
+    expected_fixes = {
+        ('expected_inputs', 'ToBeExcluded'): [
+            dict(element=1, square=1),
+            dict(element=2, square=4),
+            dict(element=3, square=9),
+        ],
+    }
+
+    actual_fixes = yaml_testing.run_test(
+        SIMPLE_PIPELINE, incomplete_spec, fix_failures=True)
+    self.assertEqual(actual_fixes, expected_fixes)
+
 
 if __name__ == '__main__':
   logging.getLogger().setLevel(logging.INFO)