|
19 | 19 | import unittest |
20 | 20 | from unittest.mock import MagicMock |
21 | 21 |
|
| 22 | +import pyarrow as pa |
| 23 | + |
22 | 24 | from pypaimon import CatalogFactory, Schema |
23 | 25 | from pypaimon.catalog.catalog_exception import (DatabaseAlreadyExistException, |
24 | 26 | DatabaseNotExistException, |
@@ -276,3 +278,162 @@ def test_get_database_propagates_exists_error(self): |
276 | 278 |
|
277 | 279 | # Restore original method |
278 | 280 | filesystem_catalog.file_io.exists = original_exists |
| 281 | + |
| 282 | + def _create_partitioned_table_with_data(self, catalog, identifier, partitions_data): |
| 283 | + """Helper to create a partitioned table and write data for each partition. |
| 284 | +
|
| 285 | + Args: |
| 286 | + catalog: The catalog instance. |
| 287 | + identifier: Table identifier string (e.g. 'test_db.tbl'). |
| 288 | + partitions_data: List of dicts, each with 'dt' and rows count. |
| 289 | + e.g. [{'dt': '2024-01-01', 'rows': 2}, {'dt': '2024-01-02', 'rows': 3}] |
| 290 | + """ |
| 291 | + pa_schema = pa.schema([ |
| 292 | + ('dt', pa.string()), |
| 293 | + ('col1', pa.int32()), |
| 294 | + ]) |
| 295 | + schema = Schema.from_pyarrow_schema(pa_schema, partition_keys=['dt']) |
| 296 | + catalog.create_table(identifier, schema, True) |
| 297 | + table = catalog.get_table(identifier) |
| 298 | + |
| 299 | + for part in partitions_data: |
| 300 | + write_builder = table.new_batch_write_builder() |
| 301 | + table_write = write_builder.new_write() |
| 302 | + table_commit = write_builder.new_commit() |
| 303 | + data = pa.Table.from_pydict({ |
| 304 | + 'dt': [part['dt']] * part['rows'], |
| 305 | + 'col1': list(range(part['rows'])), |
| 306 | + }, schema=pa_schema) |
| 307 | + table_write.write_arrow(data) |
| 308 | + table_commit.commit(table_write.prepare_commit()) |
| 309 | + table_write.close() |
| 310 | + table_commit.close() |
| 311 | + |
| 312 | + def test_list_partitions_paged(self): |
| 313 | + """Test list_partitions_paged with real data from manifest files.""" |
| 314 | + catalog = CatalogFactory.create({"warehouse": self.warehouse}) |
| 315 | + catalog.create_database("test_db", False) |
| 316 | + |
| 317 | + identifier = "test_db.part_tbl" |
| 318 | + self._create_partitioned_table_with_data(catalog, identifier, [ |
| 319 | + {'dt': '2024-01-03', 'rows': 3}, |
| 320 | + {'dt': '2024-01-01', 'rows': 2}, |
| 321 | + {'dt': '2024-01-02', 'rows': 5}, |
| 322 | + ]) |
| 323 | + |
| 324 | + # List all partitions |
| 325 | + result = catalog.list_partitions_paged(identifier) |
| 326 | + self.assertEqual(len(result.elements), 3) |
| 327 | + self.assertIsNone(result.next_page_token) |
| 328 | + |
| 329 | + # Verify partitions are sorted by spec |
| 330 | + specs = [p.spec['dt'] for p in result.elements] |
| 331 | + self.assertEqual(specs, sorted(specs)) |
| 332 | + |
| 333 | + # Verify aggregated statistics |
| 334 | + part_map = {p.spec['dt']: p for p in result.elements} |
| 335 | + self.assertEqual(part_map['2024-01-01'].record_count, 2) |
| 336 | + self.assertEqual(part_map['2024-01-02'].record_count, 5) |
| 337 | + self.assertEqual(part_map['2024-01-03'].record_count, 3) |
| 338 | + for p in result.elements: |
| 339 | + self.assertGreater(p.file_size_in_bytes, 0) |
| 340 | + self.assertGreater(p.file_count, 0) |
| 341 | + self.assertGreater(p.last_file_creation_time, 0) |
| 342 | + |
| 343 | + def test_list_partitions_paged_pagination(self): |
| 344 | + """Test list_partitions_paged pagination with max_results and page_token.""" |
| 345 | + catalog = CatalogFactory.create({"warehouse": self.warehouse}) |
| 346 | + catalog.create_database("test_db", False) |
| 347 | + |
| 348 | + identifier = "test_db.paged_tbl" |
| 349 | + self._create_partitioned_table_with_data(catalog, identifier, [ |
| 350 | + {'dt': '2024-01-01', 'rows': 1}, |
| 351 | + {'dt': '2024-01-02', 'rows': 1}, |
| 352 | + {'dt': '2024-01-03', 'rows': 1}, |
| 353 | + ]) |
| 354 | + |
| 355 | + # First page: max_results=2 |
| 356 | + page1 = catalog.list_partitions_paged(identifier, max_results=2) |
| 357 | + self.assertEqual(len(page1.elements), 2) |
| 358 | + self.assertIsNotNone(page1.next_page_token) |
| 359 | + |
| 360 | + # Second page: use next_page_token |
| 361 | + page2 = catalog.list_partitions_paged( |
| 362 | + identifier, max_results=2, page_token=page1.next_page_token |
| 363 | + ) |
| 364 | + self.assertEqual(len(page2.elements), 1) |
| 365 | + self.assertIsNone(page2.next_page_token) |
| 366 | + |
| 367 | + # All specs across pages should cover all 3 partitions |
| 368 | + all_specs = [p.spec['dt'] for p in page1.elements + page2.elements] |
| 369 | + self.assertEqual(sorted(all_specs), ['2024-01-01', '2024-01-02', '2024-01-03']) |
| 370 | + |
| 371 | + # max_results larger than total returns all |
| 372 | + result = catalog.list_partitions_paged(identifier, max_results=100) |
| 373 | + self.assertEqual(len(result.elements), 3) |
| 374 | + self.assertIsNone(result.next_page_token) |
| 375 | + |
| 376 | + def test_list_partitions_paged_pattern(self): |
| 377 | + """Test list_partitions_paged with partition_name_pattern filter.""" |
| 378 | + catalog = CatalogFactory.create({"warehouse": self.warehouse}) |
| 379 | + catalog.create_database("test_db", False) |
| 380 | + |
| 381 | + identifier = "test_db.pattern_tbl" |
| 382 | + self._create_partitioned_table_with_data(catalog, identifier, [ |
| 383 | + {'dt': '2024-01-01', 'rows': 1}, |
| 384 | + {'dt': '2024-02-01', 'rows': 1}, |
| 385 | + {'dt': '2024-02-15', 'rows': 1}, |
| 386 | + ]) |
| 387 | + |
| 388 | + # Exact match |
| 389 | + result = catalog.list_partitions_paged( |
| 390 | + identifier, partition_name_pattern='dt=2024-01-01' |
| 391 | + ) |
| 392 | + self.assertEqual(len(result.elements), 1) |
| 393 | + self.assertEqual(result.elements[0].spec['dt'], '2024-01-01') |
| 394 | + |
| 395 | + # Wildcard match |
| 396 | + result = catalog.list_partitions_paged( |
| 397 | + identifier, partition_name_pattern='dt=2024-02*' |
| 398 | + ) |
| 399 | + self.assertEqual(len(result.elements), 2) |
| 400 | + specs = sorted(p.spec['dt'] for p in result.elements) |
| 401 | + self.assertEqual(specs, ['2024-02-01', '2024-02-15']) |
| 402 | + |
| 403 | + # No match |
| 404 | + result = catalog.list_partitions_paged( |
| 405 | + identifier, partition_name_pattern='dt=2025*' |
| 406 | + ) |
| 407 | + self.assertEqual(len(result.elements), 0) |
| 408 | + |
| 409 | + def test_list_partitions_paged_empty(self): |
| 410 | + """Test list_partitions_paged on a table with no data.""" |
| 411 | + catalog = CatalogFactory.create({"warehouse": self.warehouse}) |
| 412 | + catalog.create_database("test_db", False) |
| 413 | + |
| 414 | + pa_schema = pa.schema([('dt', pa.string()), ('val', pa.int32())]) |
| 415 | + schema = Schema.from_pyarrow_schema(pa_schema, partition_keys=['dt']) |
| 416 | + catalog.create_table('test_db.empty_tbl', schema, False) |
| 417 | + |
| 418 | + result = catalog.list_partitions_paged('test_db.empty_tbl') |
| 419 | + self.assertEqual(len(result.elements), 0) |
| 420 | + self.assertIsNone(result.next_page_token) |
| 421 | + |
| 422 | + def test_list_partitions_paged_invalid_token(self): |
| 423 | + """Test list_partitions_paged with invalid page_token falls back to start.""" |
| 424 | + catalog = CatalogFactory.create({"warehouse": self.warehouse}) |
| 425 | + catalog.create_database("test_db", False) |
| 426 | + |
| 427 | + identifier = "test_db.token_tbl" |
| 428 | + self._create_partitioned_table_with_data(catalog, identifier, [ |
| 429 | + {'dt': '2024-01-01', 'rows': 1}, |
| 430 | + {'dt': '2024-01-02', 'rows': 1}, |
| 431 | + ]) |
| 432 | + |
| 433 | + # Invalid page_token should fall back to start |
| 434 | + result = catalog.list_partitions_paged( |
| 435 | + identifier, max_results=1, page_token='invalid' |
| 436 | + ) |
| 437 | + self.assertEqual(len(result.elements), 1) |
| 438 | + self.assertEqual(result.elements[0].spec['dt'], '2024-01-01') |
| 439 | + self.assertIsNotNone(result.next_page_token) |
0 commit comments