Skip to content

Spotty Integration with DuckLake #10

@nshankar

Description

@nshankar

Hi thank you for working on this, its a well-needed feature. Just wanted to point out that a very good use case for this extension is a Google Cloud hosted DuckLake, but that integration is currently not working. My repro:

Repro of Ducklake/GCS Extension Issue
# /// script
# requires-python = ">=3.12"
# dependencies = [
#   "duckdb>=1.4.3",
#   "polars",
#   "gcsfs",
#   "pyarrow"
# ]
# ///
"""
Minimal reproducible example for ducklake + GCS extension failing to work together.

This demonstrates that:
1. Writing to ducklake on GCS via fsspec works
2. Reading the same data via GCS extension with credential_chain fails

Setup:
1. Set GCS_BUCKET and GCS_METADATA_PATH below
2. Authenticate: gcloud auth application-default login
3. Run: uv run --script test_ducklake_gcs_integration.py
"""

import duckdb
import polars as pl

# ==================== CONFIGURATION ====================
GCS_BUCKET = "gs://your-bucket-name/ducklake-test" # CHANGE ME
GCS_METADATA_PATH = "/tmp/ducklake_metadata.db"
# =======================================================


def setup_ducklake_with_fsspec():
    """
    Step 1: Create ducklake table on GCS using fsspec backend (this works).
    """
    print("=" * 70)
    print("STEP 1: Creating ducklake table on GCS using fsspec backend")
    print("=" * 70)

    # Use fsspec backend
    conn = duckdb.connect()
    from fsspec import filesystem

    fs = filesystem("gcs")
    conn.register_filesystem(fs)
    
    # Load extensions
    conn.execute("INSTALL ducklake;")
    conn.execute("LOAD ducklake;")
    print("✓ Loaded ducklake extension")

    # Attach ducklake with GCS data path
    print(f"\nAttaching ducklake with metadata: {GCS_METADATA_PATH}")
    print(f"                      data path: {GCS_BUCKET}")

    conn.execute(f"""
        ATTACH 'ducklake:{GCS_METADATA_PATH}' AS dl_catalog (
            DATA_PATH '{GCS_BUCKET}'
        );
    """)
    conn.execute("USE dl_catalog.main;")
    print("✓ Attached ducklake")

    # Create test table
    print("\nCreating test table with sample data...")
    df = pl.DataFrame(
        {
            "id": [1, 2, 3, 4, 5],
            "name": ["alice", "bob", "charlie", "david", "eve"],
            "value": [10.5, 20.3, 30.1, 40.8, 50.2],
        }
    )

    # Drop if exists to allow multiple runs
    conn.execute("DROP TABLE IF EXISTS test_table;")
    conn.execute("CREATE TABLE test_table (id INTEGER, name VARCHAR, value DOUBLE);")
    conn.execute("INSERT INTO test_table SELECT * FROM df")

    result = conn.execute("SELECT COUNT(*) FROM test_table").fetchone()
    print(f"✓ Created table with {result[0]} rows")

    # Verify files were written to GCS
    files = conn.execute(
        "SELECT COUNT(*) FROM ducklake_list_files('dl_catalog', 'test_table')"
    ).fetchone()
    print(f"✓ Written {files[0]} file(s) to GCS\n")

    conn.close()
    return df


def read_ducklake_with_gcs_extension():
    """
    Step 2: Try to read the same ducklake table using GCS extension (expected to fail).
    """
    print("=" * 70)
    print("STEP 2: Reading ducklake table using GCS extension with credential_chain")
    print("=" * 70)

    conn = duckdb.connect()

    # Install GCS extension instead of using fsspec
    print("Installing GCS community extension...")
    conn.execute("INSTALL gcs FROM community;")
    conn.execute("LOAD gcs;")
    print("✓ GCS extension loaded")

    # Create secret with credential_chain
    print("\nCreating GCS secret with credential_chain...")
    conn.execute("""
        CREATE SECRET gcs_secret (
            TYPE GCP,
            PROVIDER credential_chain
        );""")
    print("✓ Secret created")

    # Load ducklake
    conn.execute("INSTALL ducklake;")
    conn.execute("LOAD ducklake;")
    print("✓ Loaded ducklake extension")

    # Try to attach and read
    print(f"                      data path: {GCS_BUCKET}")
    try:
        conn.execute(f"""
            ATTACH 'ducklake:{GCS_METADATA_PATH}' AS dl_catalog (
                DATA_PATH '{GCS_BUCKET}'
            );
        """)
        conn.execute("USE dl_catalog.main;")
        print("✓ Attached ducklake")

        print("\nAttempting: SELECT * FROM test_table")
        result = conn.execute("SELECT * FROM test_table").fetchall()
        print("✓ SUCCESS - Read {len(result)} rows!")
        print("First row:", result[0])
        return True

    except Exception as e:
        print("✗ FAILED")
        print(f"Error type: {type(e).__name__}")
        print(f"Error message: {e}")
        print("\nThis demonstrates that credential_chain does not work")
        print("for reading ducklake tables from GCS.")
        return False
    finally:
        conn.close()


def verify_direct_parquet_read():
    """
    Step 3: Verify that direct parquet reads work (sanity check).
    """
    print("\n" + "=" * 70)
    print("STEP 3: Sanity check - direct parquet read with GCS extension")
    print("=" * 70)

    conn = duckdb.connect()

    conn.execute("INSTALL gcs FROM community;")
    conn.execute("LOAD gcs;")

    secret_sql = """
    CREATE SECRET gcs_secret (
        TYPE GCP,
        PROVIDER credential_chain
    );
    """
    conn.execute(secret_sql)

    print(f"\nAttempting to list files in: {GCS_BUCKET}/*.parquet")
    try:
        # Try to read any parquet file from the bucket
        result = conn.execute(
            f"SELECT COUNT(*) FROM parquet_scan('{GCS_BUCKET}/**/*.parquet')"
        ).fetchone()
        print(f"✓ Can read parquet files directly: found {result[0]} rows")
        return True
    except Exception as e:
        print(f"✗ Direct parquet read also failed: {e}")
        print("This might indicate a broader credential issue.")
        return False
    finally:
        conn.close()


def main():
    if GCS_BUCKET == "gs://your-bucket-name/ducklake-test":
        print("\n⚠️  ERROR: Please configure GCS_BUCKET first!")
        print("Edit the GCS_BUCKET variable at the top of this file.\n")
        return

    print("\nDucklake + GCS Extension Credential Test\n")

    setup_ducklake_with_fsspec()
    gcs_ext_success = read_ducklake_with_gcs_extension()
    direct_read_success = verify_direct_parquet_read()

    # Summary
    print("\n" + "=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print("✓ fsspec write to ducklake:  SUCCESS")
    print(
        f"{'✓' if gcs_ext_success else '✗'} GCS extension read ducklake: {'SUCCESS' if gcs_ext_success else 'FAILED'}"
    )
    print(
        f"{'✓' if direct_read_success else '✗'} GCS extension read parquet: {'SUCCESS' if direct_read_success else 'FAILED'}"
    )

    if not gcs_ext_success and direct_read_success:
        print("\n→ Issue confirmed: credential_chain works for direct reads")
        print("  but not for ducklake table reads.")
    elif not gcs_ext_success and not direct_read_success:
        print("\n→ Both failed: might be a credential configuration issue.")

    # Cleanup suggestion
    print(f"\nCleanup: You may want to delete {GCS_METADATA_PATH} and {GCS_BUCKET}")


if __name__ == "__main__":
    main()

For me the repro gives

Result
❯ uv run --script test_ducklake_gcs_issue.py

Ducklake + GCS Extension Credential Test

======================================================================
STEP 1: Creating ducklake table on GCS using fsspec backend
======================================================================
✓ Loaded ducklake extension

Attaching ducklake with metadata: /tmp/ducklake_metadata.db
                      data path: gs://silurian-temporary/ducklake-test
✓ Attached ducklake

Creating test table with sample data...
✓ Created table with 5 rows
✓ Written 1 file(s) to GCS

======================================================================
STEP 2: Reading ducklake table using GCS extension with credential_chain
======================================================================
Installing GCS community extension...
✓ GCS extension loaded

Creating GCS secret with credential_chain...
✓ Secret created
✓ Loaded ducklake extension
                      data path: gs://silurian-temporary/ducklake-test
✓ Attached ducklake

Attempting: SELECT * FROM test_table
✗ FAILED
Error type: HTTPException
Error message: HTTP Error: HTTP GET error on 'https://storage.googleapis.com/silurian-temporary/ducklake-test/main/test_table/ducklake-019b3028-38b0-7bb9-9e36-4b708ae8cd36.parquet' (HTTP 403)

This demonstrates that credential_chain does not work
for reading ducklake tables from GCS.

======================================================================
STEP 3: Sanity check - direct parquet read with GCS extension
======================================================================

Attempting to list files in: gs://silurian-temporary/ducklake-test/*.parquet
✓ Can read parquet files directly: found 20 rows

======================================================================
SUMMARY
======================================================================
✓ fsspec write to ducklake:  SUCCESS
✗ GCS extension read ducklake: FAILED
✓ GCS extension read parquet: SUCCESS

→ Issue confirmed: credential_chain works for direct reads
  but not for ducklake table reads.

Cleanup: You may want to delete /tmp/ducklake_metadata.db and gs://silurian-temporary/ducklake-test

Metadata

Metadata

Assignees

Labels

No labels
No labels

Type

No type
No fields configured for issues without a type.

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions