Skip to content
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ A DataFusion extension for querying [DuckLake](https://ducklake.select). DuckLak
- Parquet footer size hints for optimized I/O
- Filter pushdown to Parquet for row group pruning and page-level filtering
- Dynamic metadata lookup (no upfront catalog caching)
- SQL-queryable `information_schema` for catalog metadata (snapshots, schemas, tables, columns, files)
- DuckDB-style table functions: `ducklake_snapshots()`, `ducklake_table_info()`, `ducklake_list_files()`

## Known Limitations

Expand Down
13 changes: 11 additions & 2 deletions examples/basic_query.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

use datafusion::execution::runtime_env::RuntimeEnv;
use datafusion::prelude::*;
use datafusion_ducklake::{DuckLakeCatalog, DuckdbMetadataProvider};
use datafusion_ducklake::{DuckLakeCatalog, DuckdbMetadataProvider, register_ducklake_functions};
use object_store::ObjectStore;
use object_store::aws::AmazonS3Builder;
use std::env;
Expand Down Expand Up @@ -66,10 +66,19 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
// Create DataFusion session context
let ctx = SessionContext::new_with_config_rt(config, runtime.clone());

// Register the DuckLake catalog
// Get the provider before moving the catalog
let provider = ducklake_catalog.provider();

// Register the DuckLake catalog (standard DataFusion pattern)
ctx.register_catalog("ducklake", Arc::new(ducklake_catalog));

// Register table functions (ducklake_snapshots, ducklake_table_info, ducklake_list_files)
register_ducklake_functions(&ctx, provider);

println!("✓ Registered DuckLake catalog with DataFusion");
println!(
"✓ Registered table functions (ducklake_snapshots, ducklake_table_info, ducklake_list_files)"
);

// List available schemas
let catalogs = ctx.catalog_names();
Expand Down
33 changes: 27 additions & 6 deletions src/catalog.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ use std::any::Any;
use std::sync::Arc;

use crate::Result;
use crate::information_schema::InformationSchemaProvider;
use crate::metadata_provider::MetadataProvider;
use crate::path_resolver::parse_object_store_url;
use crate::schema::DuckLakeSchema;
Expand Down Expand Up @@ -38,6 +39,13 @@ impl DuckLakeCatalog {
})
}

/// Get the metadata provider for this catalog
///
/// This is useful when you need to register table functions separately.
pub fn provider(&self) -> Arc<dyn MetadataProvider> {
self.provider.clone()
}

fn get_current_snapshot_id(&self) -> Result<i64> {
self.provider
.get_current_snapshot()
Expand All @@ -53,25 +61,38 @@ impl CatalogProvider for DuckLakeCatalog {
fn schema_names(&self) -> Vec<String> {
let snapshot_id = match self.get_current_snapshot_id() {
Ok(id) => id,
Err(_) => return Vec::new(),
Err(_) => return vec!["information_schema".to_string()],
Comment thread
shefeek-jinnah marked this conversation as resolved.
Outdated
};

// Query database with snapshot_id
self.provider
// Start with information_schema
let mut names = vec!["information_schema".to_string()];

// Add data schemas from catalog
let data_schemas = self
.provider
.list_schemas(snapshot_id)
.unwrap_or_default()
.into_iter()
.map(|s| s.schema_name)
.collect()
.map(|s| s.schema_name);

names.extend(data_schemas);
names
Comment thread
shefeek-jinnah marked this conversation as resolved.
}

fn schema(&self, name: &str) -> Option<Arc<dyn SchemaProvider>> {
// Handle information_schema specially
if name == "information_schema" {
return Some(Arc::new(InformationSchemaProvider::new(Arc::clone(
&self.provider,
))));
}

let snapshot_id = match self.get_current_snapshot_id() {
Ok(id) => id,
Err(_) => return None,
};

// Query database with snapshot_id
// Query database with snapshot_id for data schemas
match self.provider.get_schema_by_name(name, snapshot_id) {
Ok(Some(meta)) => {
// Resolve schema path hierarchically
Expand Down
Loading
Loading