
Commit 6880213

Mark distributed tests as integration to skip GitHub Actions workflow
1 parent 98104b4 commit 6880213

File tree

10 files changed (+59 −51 lines)

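How the skip works: a GitHub Actions job can deselect marked tests with a pytest marker expression such as `pytest -m "not integration"`, so every test this commit tags with `@pytest.mark.integration` stops running in the default workflow. Below is a minimal sketch of the marker registration that usually accompanies this pattern — a hypothetical root conftest.py, not part of this commit; `pytest_configure` and `config.addinivalue_line` are standard pytest hooks.

# Hypothetical root conftest.py (sketch) - registers the marker so pytest
# does not warn about @pytest.mark.integration being unknown.
def pytest_configure(config):
    config.addinivalue_line(
        "markers",
        "integration: needs a real Ray/Daft environment; deselected in the default CI job",
    )

A dedicated integration job (or a local run) can still opt in with `pytest -m integration`.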

deltacat/compute/converter/converter_session.py

Lines changed: 11 additions & 3 deletions
@@ -116,9 +116,13 @@ def converter_session(
         params.position_delete_for_multiple_data_files
     )

+    logger.info(f"Fetching all bucket files for table {table_identifier}...")
     data_file_dict, equality_delete_dict, pos_delete_dict = fetch_all_bucket_files(
         table=iceberg_table
     )
+    logger.info(
+        f"Fetched files - data: {len(data_file_dict)}, equality_delete: {len(equality_delete_dict)}, pos_delete: {len(pos_delete_dict)}"
+    )

     convert_input_files_for_all_buckets = group_all_files_to_each_bucket(
         data_file_dict=data_file_dict,
@@ -181,7 +185,9 @@ def convert_input_provider(index: int, item: Any) -> Dict[str, ConvertInput]:
             )
         }

-    logger.info(f"Getting remote convert tasks...")
+    logger.info(f"Creating {len(convert_input_files_for_all_buckets)} convert tasks...")
+    logger.info(f"Task max parallelism: {task_max_parallelism}")
+
     # Ray remote task: convert
     # TODO: Add split mechanism to split large buckets
     convert_tasks_pending = invoke_parallel(
@@ -193,10 +199,12 @@ def convert_input_provider(index: int, item: Any) -> Dict[str, ConvertInput]:
     )

     to_be_deleted_files_list: List[DataFile] = []
-    logger.info(f"Finished invoking {len(convert_tasks_pending)} convert tasks.")
+    logger.info(
+        f"Finished invoking {len(convert_tasks_pending)} convert tasks, waiting for results..."
+    )

     convert_results: List[ConvertResult] = ray.get(convert_tasks_pending)
-    logger.info(f"Got {len(convert_tasks_pending)} convert tasks.")
+    logger.info(f"Got {len(convert_tasks_pending)} convert task results.")

     total_position_delete_record_count = sum(
         convert_result.position_delete_record_count
deltacat/compute/converter/pyiceberg/overrides.py

Lines changed: 1 addition & 1 deletion
@@ -259,7 +259,7 @@ def fetch_all_bucket_files(
     # and collect their partition values
     target_partition_values = set()
     all_partition_values = []
-    all_table_partition_values = []
+
     for manifest_entry in chain(
         *executor.map(
             lambda args: _open_manifest(*args),

deltacat/experimental/converter_agent/table_monitor.py

Lines changed: 1 addition & 0 deletions
@@ -421,6 +421,7 @@ def monitor_table(
             params=converter_params
         )
         conversion_end_time = time.time_ns()  # Nanosecond precision
+        logger.info(f"Converter session completed successfully")

         logger.info(f"Converter session completed successfully")
         current_snapshot_id = snapshot_id

deltacat/tests/compute/converter/integration/conftest.py

Lines changed: 7 additions & 1 deletion
@@ -75,6 +75,12 @@ def session_catalog() -> Catalog:

 @pytest.fixture(autouse=True, scope="module")
 def setup_ray_cluster():
-    ray.init(local_mode=True, ignore_reinit_error=True)
+    ray.init(
+        local_mode=True,
+        ignore_reinit_error=True,
+        resources={
+            "convert_task": 10
+        },  # Provide convert_task resource for converter session
+    )
     yield
     ray.shutdown()
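Why the fixture must advertise `convert_task`: Ray schedules a task only onto a node that declares every custom resource the task requests, so a converter task that requires `convert_task` would queue forever on a cluster initialized without it. A self-contained sketch of the mechanism follows — the `convert` function and resource amounts are illustrative, not deltacat's actual task definition.

import ray

# Advertise 10 units of the custom "convert_task" resource, mirroring the fixture.
ray.init(resources={"convert_task": 10})

# Each invocation requests 1 unit; without the resource declared above,
# these tasks would wait indefinitely instead of running.
@ray.remote(resources={"convert_task": 1})
def convert(bucket_index: int) -> int:
    return bucket_index * 2  # placeholder for real conversion work

print(ray.get([convert.remote(i) for i in range(3)]))  # prints [0, 2, 4]
ray.shutdown()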

deltacat/tests/compute/converter/integration/test_convert_session.py

Lines changed: 2 additions & 13 deletions
@@ -57,11 +57,11 @@


 @pytest.fixture(scope="session")
-def daft_native_runner_session():
+def daft_native_runner():
     """
     Session-scoped fixture to set Daft to use native runner for converter integration tests.
     This is set once per test session and cannot be changed (Daft limitation).
-    Only applied to tests that explicitly request daft_native_runner.
+    Tests that need the native runner should explicitly request this fixture.
     """
     # Set to native runner only when explicitly requested
     # Note: Daft only allows setting runner once per session
@@ -76,17 +76,6 @@ def daft_native_runner_session():
     # No teardown needed - Daft doesn't allow changing runner after it's set


-@pytest.fixture
-def daft_native_runner(daft_native_runner_session):
-    """
-    Per-test fixture that depends on the session-scoped runner setup.
-    This ensures tests get the native runner without trying to change it.
-    Tests must explicitly request this fixture to use the native runner.
-    """
-    # Just yield - the actual setup is done by the session fixture
-    yield
-
-
 # Test data fixtures
 @pytest.fixture
 def base_schema():

deltacat/tests/compute/converter/integration/test_converter_commit_conflict_resolution.py

Lines changed: 1 addition & 0 deletions
@@ -40,6 +40,7 @@
 # Task memory in bytes for testing
 TASK_MEMORY_BYTES = BASE_MEMORY_BUFFER

+
 # Test data fixtures
 @pytest.fixture
 def base_schema():
(File name missing from the page. Judging by its contents — the Ray fixture's docstring references "table monitor tests" — this is most likely deltacat/tests/experimental/converter_agent/conftest.py, but that path is an inference, not shown in the source.)

Lines changed: 28 additions & 1 deletion

@@ -1,10 +1,37 @@
 import pytest
 import ray
+import daft


 @pytest.fixture(autouse=True, scope="module")
 def setup_ray_cluster():
     """Set up Ray cluster for table monitor tests."""
-    ray.init(local_mode=True, ignore_reinit_error=True)
+    ray.init(
+        local_mode=True,
+        ignore_reinit_error=True,
+        resources={
+            "convert_task": 10
+        },  # Provide convert_task resource for converter session
+    )
     yield
     ray.shutdown()
+
+
+@pytest.fixture(scope="session")
+def daft_native_runner():
+    """
+    Session-scoped fixture to set Daft to use native runner for table monitor tests.
+    This is set once per test session and cannot be changed (Daft limitation).
+    Tests that need the native runner should explicitly request this fixture.
+    """
+    # Set to native runner only when explicitly requested
+    # Note: Daft only allows setting runner once per session
+    try:
+        daft.context.set_runner_native()
+    except Exception as e:
+        # If runner is already set, that's okay - just log it
+        print(f"Note: Daft runner already set, continuing with existing runner: {e}")
+
+    yield
+
+    # No teardown needed - Daft doesn't allow changing runner after it's set
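The try/except above exists because, per the fixture's own comments, Daft pins the runner for the lifetime of the process. A sketch of that constraint — the failure on a second call is assumed from those comments, not verified against Daft documentation:

import daft

daft.context.set_runner_native()       # first call wins for the whole session
try:
    daft.context.set_runner_ray()      # attempting to switch afterwards fails
except Exception as e:
    print(f"Runner already set: {e}")  # same fallback the fixture uses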

deltacat/tests/experimental/converter_agent/test_table_monitor.py

Lines changed: 3 additions & 1 deletion
@@ -322,7 +322,9 @@ def test_stage_in_context(self):
 class TestTableMonitorEndToEnd:
     """End-to-end integration test for table monitor."""

-    def test_table_monitor_with_shared_catalog(self, setup_ray_cluster):
+    def test_table_monitor_with_shared_catalog(
+        self, setup_ray_cluster, daft_native_runner
+    ):
         """
         Test that table monitor automatically detects and converts data using a shared catalog.
deltacat/tests/storage/main/test_main_storage.py

Lines changed: 3 additions & 12 deletions
@@ -9,7 +9,6 @@
 import polars as pl
 import numpy as np
 import ray
-import daft
 import ray.data

 from deltacat import PartitionKey, PartitionScheme
@@ -7629,12 +7628,10 @@ def test_download_delta_distributed_error_handling(self):
         )

     # ========== DAFT DISTRIBUTED TESTS ==========
-
+    @pytest.mark.integration
     def test_download_delta_distributed_daft_basic(self):
         """Test basic distributed download with DAFT dataset type."""

-        daft.context.set_runner_ray()
-
         # Create test data
         test_data = pd.DataFrame(
             {
@@ -7688,13 +7685,10 @@ def test_download_delta_distributed_daft_basic(self):
         ), "Column names mismatch"
         pd.testing.assert_frame_equal(downloaded_df, expected_df)

+    @pytest.mark.integration
     def test_download_delta_distributed_daft_with_delta_locator(self):
         """Test DAFT distributed download using DeltaLocator instead of Delta object."""

-        if ray.is_initialized():
-            ray.shutdown()
-        ray.init()
-
         test_data = pd.DataFrame(
             {
                 "id": [12101, 12102, 12103],
@@ -7731,13 +7725,10 @@ def test_download_delta_distributed_daft_with_delta_locator(self):
         expected_df = test_data.sort_values("id").reset_index(drop=True)
         pd.testing.assert_frame_equal(downloaded_df, expected_df)

+    @pytest.mark.integration
     def test_download_delta_distributed_daft_vs_ray_consistency(self):
         """Test that DAFT and Ray distributed downloads return the same data."""

-        if ray.is_initialized():
-            ray.shutdown()
-        ray.init()
-
         test_data = pd.DataFrame(
             {
                 "id": [12501, 12502, 12503, 12504],

deltacat/tests/utils/test_daft.py

Lines changed: 2 additions & 19 deletions
@@ -1,5 +1,5 @@
 import unittest
-import ray
+import pytest
 from deltacat.types.media import ContentEncoding, ContentType
 from deltacat.utils.daft import (
     daft_file_to_pyarrow_table,
@@ -162,12 +162,11 @@ def test_read_from_local_single_column_with_row_groups(self):
         self.assertEqual(table.num_rows, 10)


+@pytest.mark.integration
 class TestFilesToDataFrame(unittest.TestCase):
     MVP_PATH = "deltacat/tests/utils/data/mvp.parquet"

     def test_read_local_files_all_columns(self):
-        if not ray.is_initialized():
-            ray.init()
         df = files_to_dataframe(
             uris=[self.MVP_PATH],
             content_encoding=ContentEncoding.IDENTITY.value,
@@ -180,8 +179,6 @@ def test_read_local_files_all_columns(self):
         self.assertEqual(table.num_rows, 100)

     def test_read_local_files_with_column_selection(self):
-        if not ray.is_initialized():
-            ray.init()
         df = files_to_dataframe(
             uris=[self.MVP_PATH],
             content_encoding=ContentEncoding.IDENTITY.value,
@@ -195,8 +192,6 @@ def test_read_local_files_with_column_selection(self):
         self.assertEqual(table.num_rows, 100)

     def test_read_local_files_does_not_materialize_by_default(self):
-        if not ray.is_initialized():
-            ray.init()
         df = files_to_dataframe(
             uris=[self.MVP_PATH],
             content_encoding=ContentEncoding.IDENTITY.value,
@@ -212,8 +207,6 @@ def test_read_local_files_does_not_materialize_by_default(self):
         self.assertEqual(len(df), 100)

     def test_supports_unescaped_tsv_content_type(self):
-        if not ray.is_initialized():
-            ray.init()
         # Test that UNESCAPED_TSV is now supported (was previously unsupported)
         # Use a CSV file since we're testing TSV reader functionality
         csv_path = "deltacat/tests/utils/data/non_empty_valid.csv"
@@ -230,8 +223,6 @@ def test_supports_unescaped_tsv_content_type(self):
         self.assertGreater(len(table.schema.names), 0)

     def test_supports_gzip_content_encoding(self):
-        if not ray.is_initialized():
-            ray.init()
         # Test that GZIP encoding is now supported (was previously unsupported)
         df = files_to_dataframe(
             uris=[self.MVP_PATH],
@@ -245,8 +236,6 @@ def test_supports_gzip_content_encoding(self):
         self.assertEqual(table.num_rows, 100)

     def test_raises_error_if_not_supported_content_type(self):
-        if not ray.is_initialized():
-            ray.init()
         # Test that truly unsupported content types raise NotImplementedError
         self.assertRaises(
             NotImplementedError,
@@ -259,8 +248,6 @@ def test_raises_error_if_not_supported_content_type(self):
         )

     def test_raises_error_if_not_supported_content_encoding(self):
-        if not ray.is_initialized():
-            ray.init()
         # Test that truly unsupported content encodings raise NotImplementedError
         self.assertRaises(
             NotImplementedError,
@@ -273,8 +260,6 @@ def test_raises_error_if_not_supported_content_encoding(self):
         )

     def test_accepts_custom_kwargs(self):
-        if not ray.is_initialized():
-            ray.init()
         # Test that custom kwargs are passed through to daft.read_parquet
         df = files_to_dataframe(
             uris=[self.MVP_PATH],
@@ -290,8 +275,6 @@ def test_accepts_custom_kwargs(self):
         self.assertEqual(table.num_rows, 100)

     def test_accepts_io_config(self):
-        if not ray.is_initialized():
-            ray.init()
         # Test that io_config parameter is accepted and passed correctly
         df = files_to_dataframe(
             uris=[self.MVP_PATH],
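Note that `@pytest.mark.integration` works on a `unittest.TestCase` subclass: pytest applies class-level marks to every test method it collects, so the single decorator covers all of `TestFilesToDataFrame`. A hedged sketch of the pattern, with illustrative class and method names:

import unittest
import pytest

@pytest.mark.integration  # applies to every collected test method below
class TestNeedsCluster(unittest.TestCase):
    def test_one(self):
        self.assertTrue(True)

    def test_two(self):
        self.assertTrue(True)

Running `pytest -m "not integration"` deselects both methods; `pytest -m integration` runs only them.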
