From a34fa73de6f0bc92f54e2a2a11ed8a93ef427c99 Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Thu, 7 Aug 2025 13:26:45 -0400 Subject: [PATCH 1/2] Work toward wasm-wasi compat --- Cargo.lock | 3 + icechunk/Cargo.toml | 40 +- icechunk/src/config.rs | 2 + icechunk/src/lib.rs | 8 +- icechunk/src/storage/errors.rs | 79 + icechunk/src/storage/implementations/mod.rs | 10 + .../storage/implementations/object_store.rs | 1297 +++++++++++++++++ icechunk/src/storage/implementations/s3.rs | 1144 +++++++++++++++ icechunk/src/storage/mod.rs | 854 ++--------- icechunk/src/storage/traits.rs | 352 +++++ icechunk/src/storage/types.rs | 329 +++++ icechunk/src/virtual_chunks.rs | 165 ++- 12 files changed, 3490 insertions(+), 793 deletions(-) create mode 100644 icechunk/src/storage/errors.rs create mode 100644 icechunk/src/storage/implementations/mod.rs create mode 100644 icechunk/src/storage/implementations/object_store.rs create mode 100644 icechunk/src/storage/implementations/s3.rs create mode 100644 icechunk/src/storage/traits.rs create mode 100644 icechunk/src/storage/types.rs diff --git a/Cargo.lock b/Cargo.lock index c403d039c..9b6c0e85e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1374,8 +1374,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" dependencies = [ "cfg-if", + "js-sys", "libc", "wasi 0.11.0+wasi-snapshot-preview1", + "wasm-bindgen", ] [[package]] @@ -1735,6 +1737,7 @@ dependencies = [ "err-into", "flatbuffers", "futures", + "getrandom 0.2.15", "icechunk-macros", "itertools", "object_store", diff --git a/icechunk/Cargo.toml b/icechunk/Cargo.toml index 0c12f36d8..714ef0316 100644 --- a/icechunk/Cargo.toml +++ b/icechunk/Cargo.toml @@ -17,21 +17,13 @@ async-trait = "0.1.88" bytes = { version = "1.10.1", features = ["serde"] } base64 = "0.22.1" futures = "0.3.31" +getrandom = {version = "0.2.15", features = ["js"]} itertools = "0.14.0" -object_store = { version = 
"0.12.3", features = [ - "aws", - "gcp", - "azure", - "http", -] } rand = "0.9.2" thiserror = "2.0.12" serde_json = "1.0.142" serde = { version = "1.0.219", features = ["derive", "rc"] } serde_with = { version = "3.14.0", features = ["hex"] } -tokio = { version = "1.47.1", features = ["rt-multi-thread", "macros"] } -test-strategy = "0.4.3" -proptest = "1.7.0" quick_cache = "0.6.16" base32 = "0.5.1" chrono = { version = "0.4.41", features = ["serde"] } @@ -40,17 +32,9 @@ rmp-serde = "1.3.0" url = "2.5.4" async-stream = "0.3.6" rmpv = { version = "1.3.0", features = ["serde", "with-serde"] } -aws-sdk-s3 = "=1.78.0" -aws-config = "=1.5.18" -aws-credential-types = "1.2.4" typed-path = "0.11.0" -aws-smithy-types-convert = { version = "0.60.9", features = [ - "convert-chrono", - "convert-streams", -] } typetag = "0.2.20" zstd = "0.13.3" -tokio-util = { version = "0.7.16", features = ["compat", "io-util"] } serde_bytes = "0.11.17" regex = "1.11.1" tracing-error = "0.2.1" @@ -67,9 +51,31 @@ dirs = { version = "6.0.0", optional = true } assert_fs = { version = "1.1.3", optional = true } flatbuffers = "25.2.10" +[target.'cfg(not(target_arch = "wasm32"))'.dependencies] +aws-sdk-s3 = "=1.78.0" +aws-config = "=1.5.18" +aws-credential-types = "1.2.4" +aws-smithy-types-convert = { version = "0.60.9", features = [ + "convert-chrono", + "convert-streams", +] } +object_store = { version = "0.12.3", features = [ + "aws", + "gcp", + "azure", + "http", +] } +tokio = { version = "1.47.1", features = ["rt-multi-thread", "macros"] } +tokio-util = { version = "0.7.16", features = ["compat", "io-util"] } + +[target.'cfg(target_arch = "wasm32")'.dependencies] +tokio = { version = "1.47.1", features = ["io-util", "rt", "macros", "sync"] } + [dev-dependencies] icechunk-macros = { path = "../icechunk-macros", version = "0.1.0" } pretty_assertions = "1.4.1" +test-strategy = "0.4.3" +proptest = "1.7.0" proptest-state-machine = "0.4.0" tempfile = "3.20.0" test-log = { version = "0.2.18", 
default-features = false, features = [ diff --git a/icechunk/src/config.rs b/icechunk/src/config.rs index 89e3d00f2..61eba8376 100644 --- a/icechunk/src/config.rs +++ b/icechunk/src/config.rs @@ -9,6 +9,7 @@ use std::{ use async_trait::async_trait; use chrono::{DateTime, Utc}; use itertools::Either; +#[cfg(not(target_arch = "wasm32"))] pub use object_store::gcp::GcpCredential; use regex::bytes::Regex; use serde::{Deserialize, Serialize}; @@ -519,6 +520,7 @@ pub struct GcsBearerCredential { pub expires_after: Option>, } +#[cfg(not(target_arch = "wasm32"))] impl From<&GcsBearerCredential> for GcpCredential { fn from(value: &GcsBearerCredential) -> Self { GcpCredential { bearer: value.bearer.clone() } diff --git a/icechunk/src/lib.rs b/icechunk/src/lib.rs index 31aef15f6..0d58be597 100644 --- a/icechunk/src/lib.rs +++ b/icechunk/src/lib.rs @@ -38,10 +38,10 @@ pub mod virtual_chunks; pub use config::{ObjectStoreConfig, RepositoryConfig}; pub use repository::Repository; -pub use storage::{ - ObjectStorage, Storage, StorageError, new_in_memory_storage, - new_local_filesystem_storage, new_s3_storage, -}; +pub use storage::{Storage, StorageError, new_in_memory_storage}; + +#[cfg(not(target_arch = "wasm32"))] +pub use storage::{ObjectStorage, new_local_filesystem_storage, new_s3_storage}; pub use store::Store; mod private { diff --git a/icechunk/src/storage/errors.rs b/icechunk/src/storage/errors.rs new file mode 100644 index 000000000..256ac4c2d --- /dev/null +++ b/icechunk/src/storage/errors.rs @@ -0,0 +1,79 @@ +use std::ffi::OsString; +use thiserror::Error; + +use crate::error::ICError; + +#[cfg(not(target_arch = "wasm32"))] +use aws_sdk_s3::{ + config::http::HttpResponse, + error::SdkError, + operation::{ + complete_multipart_upload::CompleteMultipartUploadError, + create_multipart_upload::CreateMultipartUploadError, + delete_objects::DeleteObjectsError, get_object::GetObjectError, + head_object::HeadObjectError, list_objects_v2::ListObjectsV2Error, + 
put_object::PutObjectError, upload_part::UploadPartError, + }, + primitives::ByteStreamError, +}; + +#[derive(Debug, Error)] +pub enum StorageErrorKind { + #[cfg(not(target_arch = "wasm32"))] + #[error("object store error {0}")] + ObjectStore(#[from] Box<::object_store::Error>), + #[error("bad object store prefix {0:?}")] + BadPrefix(OsString), + #[cfg(not(target_arch = "wasm32"))] + #[error("error getting object from object store {0}")] + S3GetObjectError(#[from] Box>), + #[cfg(not(target_arch = "wasm32"))] + #[error("error writing object to object store {0}")] + S3PutObjectError(#[from] Box>), + #[cfg(not(target_arch = "wasm32"))] + #[error("error creating multipart upload {0}")] + S3CreateMultipartUploadError( + #[from] Box>, + ), + #[cfg(not(target_arch = "wasm32"))] + #[error("error uploading multipart part {0}")] + S3UploadPartError(#[from] Box>), + #[cfg(not(target_arch = "wasm32"))] + #[error("error completing multipart upload {0}")] + S3CompleteMultipartUploadError( + #[from] Box>, + ), + #[cfg(not(target_arch = "wasm32"))] + #[error("error getting object metadata from object store {0}")] + S3HeadObjectError(#[from] Box>), + #[cfg(not(target_arch = "wasm32"))] + #[error("error listing objects in object store {0}")] + S3ListObjectError(#[from] Box>), + #[cfg(not(target_arch = "wasm32"))] + #[error("error deleting objects in object store {0}")] + S3DeleteObjectError(#[from] Box>), + #[cfg(not(target_arch = "wasm32"))] + #[error("error streaming bytes from object store {0}")] + S3StreamError(#[from] Box), + #[error("I/O error: {0}")] + IOError(#[from] std::io::Error), + #[error("storage configuration error: {0}")] + R2ConfigurationError(String), + #[error("storage error: {0}")] + Other(String), +} + +pub type StorageError = ICError; + +// it would be great to define this impl in error.rs, but it conflicts with the blanket +// `impl From for T` +impl From for StorageError +where + E: Into, +{ + fn from(value: E) -> Self { + Self::new(value.into()) + } +} + 
+pub type StorageResult = Result; diff --git a/icechunk/src/storage/implementations/mod.rs b/icechunk/src/storage/implementations/mod.rs new file mode 100644 index 000000000..ed9a59f17 --- /dev/null +++ b/icechunk/src/storage/implementations/mod.rs @@ -0,0 +1,10 @@ +#[cfg(not(target_arch = "wasm32"))] +pub mod object_store; +#[cfg(not(target_arch = "wasm32"))] +pub mod s3; + +// Re-export implementations conditionally +#[cfg(not(target_arch = "wasm32"))] +pub use object_store::ObjectStorage; +#[cfg(not(target_arch = "wasm32"))] +pub use s3::S3Storage; diff --git a/icechunk/src/storage/implementations/object_store.rs b/icechunk/src/storage/implementations/object_store.rs new file mode 100644 index 000000000..7e3d6a231 --- /dev/null +++ b/icechunk/src/storage/implementations/object_store.rs @@ -0,0 +1,1297 @@ +use crate::{ + config::{ + AzureCredentials, AzureStaticCredentials, GcsBearerCredential, GcsCredentials, + GcsCredentialsFetcher, GcsStaticCredentials, S3Credentials, S3Options, + }, + format::{ChunkId, ChunkOffset, FileTypeTag, ManifestId, ObjectId, SnapshotId}, + private, +}; +use async_trait::async_trait; +use bytes::{Buf, Bytes}; +use chrono::{DateTime, TimeDelta, Utc}; +use futures::{ + StreamExt, TryStreamExt, + stream::{self, BoxStream}, +}; +use object_store::{ + Attribute, AttributeValue, Attributes, BackoffConfig, ClientConfigKey, + CredentialProvider, GetOptions, ObjectMeta, ObjectStore, PutMode, PutOptions, + PutPayload, RetryConfig, StaticCredentialProvider, UpdateVersion, + aws::AmazonS3Builder, + azure::{AzureConfigKey, MicrosoftAzureBuilder}, + gcp::{GcpCredential, GoogleCloudStorageBuilder, GoogleConfigKey}, + http::HttpBuilder, + local::LocalFileSystem, + memory::InMemory, + path::Path as ObjectPath, +}; +use serde::{Deserialize, Serialize}; +use std::{ + collections::HashMap, + fmt::{self, Debug, Display}, + fs::create_dir_all, + future::ready, + num::{NonZeroU16, NonZeroU64}, + ops::Range, + path::{Path as StdPath, PathBuf}, + sync::Arc, 
+}; +use tokio::{ + io::AsyncRead, + sync::{OnceCell, RwLock}, +}; +use tokio_util::compat::FuturesAsyncReadCompatExt; +use tracing::instrument; + +use crate::storage::{ + CHUNK_PREFIX, CONFIG_PATH, ConcurrencySettings, DeleteObjectsResult, ETag, + FetchConfigResult, Generation, GetRefResult, ListInfo, MANIFEST_PREFIX, REF_PREFIX, + Reader, RetriesSettings, SNAPSHOT_PREFIX, Settings, Storage, StorageError, + StorageErrorKind, StorageResult, TRANSACTION_PREFIX, UpdateConfigResult, VersionInfo, + WriteRefResult, +}; + +#[derive(Debug, Serialize, Deserialize)] +pub struct ObjectStorage { + backend: Arc, + #[serde(skip)] + /// We need to use OnceCell to allow async initialization, because serde + /// does not support async cfunction calls from deserialization. This gives + /// us a way to lazily initialize the client. + client: OnceCell>, +} + +impl ObjectStorage { + /// Create an in memory Storage implementation + /// + /// This implementation should not be used in production code. + pub async fn new_in_memory() -> Result { + let backend = Arc::new(InMemoryObjectStoreBackend); + let storage = ObjectStorage { backend, client: OnceCell::new() }; + Ok(storage) + } + + /// Create an local filesystem Storage implementation + /// + /// This implementation should not be used in production code. + pub async fn new_local_filesystem( + prefix: &StdPath, + ) -> Result { + tracing::warn!( + "The LocalFileSystem storage is not safe for concurrent commits. If more than one thread/process will attempt to commit at the same time, prefer using object stores." 
+ ); + let backend = + Arc::new(LocalFileSystemObjectStoreBackend { path: prefix.to_path_buf() }); + let storage = ObjectStorage { backend, client: OnceCell::new() }; + Ok(storage) + } + + pub async fn new_s3( + bucket: String, + prefix: Option, + credentials: Option, + config: Option, + ) -> Result { + let backend = + Arc::new(S3ObjectStoreBackend { bucket, prefix, credentials, config }); + let storage = ObjectStorage { backend, client: OnceCell::new() }; + + Ok(storage) + } + + pub async fn new_azure( + account: String, + container: String, + prefix: Option, + credentials: Option, + config: Option>, + ) -> Result { + let backend = Arc::new(AzureObjectStoreBackend { + account, + container, + prefix, + credentials, + config, + }); + let storage = ObjectStorage { backend, client: OnceCell::new() }; + + Ok(storage) + } + + pub async fn new_gcs( + bucket: String, + prefix: Option, + credentials: Option, + config: Option>, + ) -> Result { + let backend = + Arc::new(GcsObjectStoreBackend { bucket, prefix, credentials, config }); + let storage = ObjectStorage { backend, client: OnceCell::new() }; + + Ok(storage) + } + + /// Get the client, initializing it if it hasn't been initialized yet. This is necessary because the + /// client is not serializeable and must be initialized after deserialization. Under normal construction + /// the original client is returned immediately. + #[instrument(skip_all)] + async fn get_client(&self, settings: &Settings) -> &Arc { + self.client + .get_or_init(|| async { + // TODO: handle error better? + #[allow(clippy::expect_used)] + self.backend + .mk_object_store(settings) + .expect("failed to create object store") + }) + .await + } + + /// We need this because object_store's local file implementation doesn't sort refs. Since this + /// implementation is used only for tests, it's OK to sort in memory. 
+ pub fn artificially_sort_refs_in_mem(&self) -> bool { + self.backend.artificially_sort_refs_in_mem() + } + + /// Return all keys in the store + /// + /// Intended for testing and debugging purposes only. + pub async fn all_keys(&self) -> StorageResult> { + Ok(self + .get_client(&self.backend.default_settings()) + .await + .list(None) + .map_ok(|obj| obj.location.to_string()) + .try_collect() + .await + .map_err(Box::new)?) + } + + fn get_path_str(&self, file_prefix: &str, id: &str) -> ObjectPath { + let path = format!("{}/{}/{}", self.backend.prefix(), file_prefix, id); + ObjectPath::from(path) + } + + fn get_path( + &self, + file_prefix: &str, + id: &ObjectId, + ) -> ObjectPath { + // we serialize the url using crockford + self.get_path_str(file_prefix, id.to_string().as_str()) + } + + fn get_config_path(&self) -> ObjectPath { + self.get_path_str("", CONFIG_PATH) + } + + fn get_snapshot_path(&self, id: &SnapshotId) -> ObjectPath { + self.get_path(SNAPSHOT_PREFIX, id) + } + + fn get_manifest_path(&self, id: &ManifestId) -> ObjectPath { + self.get_path(MANIFEST_PREFIX, id) + } + + fn get_transaction_path(&self, id: &SnapshotId) -> ObjectPath { + self.get_path(TRANSACTION_PREFIX, id) + } + + fn get_chunk_path(&self, id: &ChunkId) -> ObjectPath { + self.get_path(CHUNK_PREFIX, id) + } + + fn drop_prefix(&self, prefix: &ObjectPath, path: &ObjectPath) -> Option { + path.prefix_match(&ObjectPath::from(format!("{prefix}"))).map(|it| it.collect()) + } + + fn ref_key(&self, ref_key: &str) -> ObjectPath { + // ObjectPath knows how to deal with empty path parts: bar//foo + ObjectPath::from(format!("{}/{}/{}", self.backend.prefix(), REF_PREFIX, ref_key)) + } + + async fn get_object_reader( + &self, + settings: &Settings, + path: &ObjectPath, + ) -> StorageResult> { + Ok(self + .get_client(settings) + .await + .get(path) + .await + .map_err(Box::new)? 
+ .into_stream() + .err_into() + .into_async_read() + .compat()) + } + + fn metadata_to_attributes( + &self, + settings: &Settings, + metadata: Vec<(String, String)>, + ) -> Attributes { + if settings.unsafe_use_metadata() { + Attributes::from_iter(metadata.into_iter().map(|(key, val)| { + ( + Attribute::Metadata(std::borrow::Cow::Owned(key)), + AttributeValue::from(val), + ) + })) + } else { + Attributes::new() + } + } + + fn get_ref_name(&self, prefix: &ObjectPath, meta: &ObjectMeta) -> Option { + let relative_key = self.drop_prefix(prefix, &meta.location)?; + let parent = relative_key.parts().next()?; + Some(parent.as_ref().to_string()) + } + + fn get_put_mode( + &self, + settings: &Settings, + previous_version: &VersionInfo, + ) -> PutMode { + match ( + previous_version.is_create(), + settings.unsafe_use_conditional_create(), + settings.unsafe_use_conditional_update(), + ) { + (true, true, _) => PutMode::Create, + (true, false, _) => PutMode::Overwrite, + + (false, _, true) => PutMode::Update(UpdateVersion { + e_tag: previous_version.etag().cloned(), + version: previous_version.generation().cloned(), + }), + (false, _, false) => PutMode::Overwrite, + } + } +} + +impl fmt::Display for ObjectStorage { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "ObjectStorage(backend={})", self.backend) + } +} + +impl private::Sealed for ObjectStorage {} + +#[async_trait] +#[typetag::serde] +impl Storage for ObjectStorage { + fn can_write(&self) -> bool { + self.backend.can_write() + } + + #[instrument(skip_all)] + fn default_settings(&self) -> Settings { + self.backend.default_settings() + } + + #[instrument(skip_all)] + async fn fetch_config( + &self, + settings: &Settings, + ) -> StorageResult { + let path = self.get_config_path(); + let response = self.get_client(settings).await.get(&path).await; + + match response { + Ok(result) => { + let version = VersionInfo { + etag: result.meta.e_tag.as_ref().cloned().map(ETag), + generation: 
result.meta.version.as_ref().cloned().map(Generation), + }; + + Ok(FetchConfigResult::Found { + bytes: result.bytes().await.map_err(Box::new)?, + version, + }) + } + Err(object_store::Error::NotFound { .. }) => Ok(FetchConfigResult::NotFound), + Err(err) => Err(Box::new(err).into()), + } + } + #[instrument(skip(self, settings, config))] + async fn update_config( + &self, + settings: &Settings, + config: Bytes, + previous_version: &VersionInfo, + ) -> StorageResult { + let path = self.get_config_path(); + let attributes = if settings.unsafe_use_metadata() { + Attributes::from_iter(vec![( + Attribute::ContentType, + AttributeValue::from("application/yaml"), + )]) + } else { + Attributes::new() + }; + + let mode = self.get_put_mode(settings, previous_version); + + let options = PutOptions { mode, attributes, ..PutOptions::default() }; + let res = + self.get_client(settings).await.put_opts(&path, config.into(), options).await; + match res { + Ok(res) => { + let new_version = VersionInfo { + etag: res.e_tag.map(ETag), + generation: res.version.map(Generation), + }; + Ok(UpdateConfigResult::Updated { new_version }) + } + Err(object_store::Error::Precondition { .. 
}) => { + Ok(UpdateConfigResult::NotOnLatestVersion) + } + Err(err) => Err(Box::new(err).into()), + } + } + + #[instrument(skip(self, settings))] + async fn fetch_snapshot( + &self, + settings: &Settings, + id: &SnapshotId, + ) -> StorageResult> { + let path = self.get_snapshot_path(id); + Ok(Box::new(self.get_object_reader(settings, &path).await?)) + } + + #[instrument(skip(self, settings))] + async fn fetch_manifest_known_size( + &self, + settings: &Settings, + id: &ManifestId, + size: u64, + ) -> StorageResult { + let path = self.get_manifest_path(id); + self.get_object_concurrently(settings, path.as_ref(), &(0..size)).await + } + + #[instrument(skip(self, settings))] + async fn fetch_manifest_unknown_size( + &self, + settings: &Settings, + id: &ManifestId, + ) -> StorageResult> { + let path = self.get_manifest_path(id); + Ok(Box::new(self.get_object_reader(settings, &path).await?)) + } + + #[instrument(skip(self, settings))] + async fn fetch_transaction_log( + &self, + settings: &Settings, + id: &SnapshotId, + ) -> StorageResult> { + let path = self.get_transaction_path(id); + Ok(Box::new(self.get_object_reader(settings, &path).await?)) + } + + #[instrument(skip(self, settings, metadata, bytes))] + async fn write_snapshot( + &self, + settings: &Settings, + id: SnapshotId, + metadata: Vec<(String, String)>, + bytes: Bytes, + ) -> StorageResult<()> { + let path = self.get_snapshot_path(&id); + let attributes = self.metadata_to_attributes(settings, metadata); + let options = PutOptions { attributes, ..PutOptions::default() }; + // FIXME: use multipart + self.get_client(settings) + .await + .put_opts(&path, bytes.into(), options) + .await + .map_err(Box::new)?; + Ok(()) + } + + #[instrument(skip(self, settings, metadata, bytes))] + async fn write_manifest( + &self, + settings: &Settings, + id: ManifestId, + metadata: Vec<(String, String)>, + bytes: Bytes, + ) -> StorageResult<()> { + let path = self.get_manifest_path(&id); + let attributes = 
self.metadata_to_attributes(settings, metadata); + let options = PutOptions { attributes, ..PutOptions::default() }; + // FIXME: use multipart + self.get_client(settings) + .await + .put_opts(&path, bytes.into(), options) + .await + .map_err(Box::new)?; + Ok(()) + } + + #[instrument(skip(self, settings, metadata, bytes))] + async fn write_transaction_log( + &self, + settings: &Settings, + id: SnapshotId, + metadata: Vec<(String, String)>, + bytes: Bytes, + ) -> StorageResult<()> { + let path = self.get_transaction_path(&id); + let attributes = self.metadata_to_attributes(settings, metadata); + let options = PutOptions { attributes, ..PutOptions::default() }; + // FIXME: use multipart + self.get_client(settings) + .await + .put_opts(&path, bytes.into(), options) + .await + .map_err(Box::new)?; + Ok(()) + } + + #[instrument(skip(self, settings))] + async fn fetch_chunk( + &self, + settings: &Settings, + id: &ChunkId, + range: &Range, + ) -> Result { + let path = self.get_chunk_path(id); + self.get_object_concurrently(settings, path.as_ref(), range) + .await? + .to_bytes((range.end - range.start + 16) as usize) + .await + } + + #[instrument(skip(self, settings, bytes))] + async fn write_chunk( + &self, + settings: &Settings, + id: ChunkId, + bytes: Bytes, + ) -> Result<(), StorageError> { + let path = self.get_chunk_path(&id); + self.get_client(settings) + .await + .put(&path, bytes.into()) + .await + .map_err(Box::new)?; + Ok(()) + } + + #[instrument(skip(self, settings))] + async fn get_ref( + &self, + settings: &Settings, + ref_key: &str, + ) -> StorageResult { + let key = self.ref_key(ref_key); + match self.get_client(settings).await.get(&key).await { + Ok(res) => { + let etag = res.meta.e_tag.clone().map(ETag); + let generation = res.meta.version.clone().map(Generation); + Ok(GetRefResult::Found { + bytes: res.bytes().await.map_err(Box::new)?, + version: VersionInfo { etag, generation }, + }) + } + Err(object_store::Error::NotFound { .. 
}) => Ok(GetRefResult::NotFound), + Err(err) => Err(Box::new(err).into()), + } + } + + #[instrument(skip(self, settings))] + async fn ref_names(&self, settings: &Settings) -> StorageResult> { + let prefix = &self.ref_key(""); + + Ok(self + .get_client(settings) + .await + .list(Some(prefix.clone()).as_ref()) + .try_filter_map(|meta| async move { + let name = self.get_ref_name(prefix, &meta); + if name.is_none() { + tracing::error!(object = ?meta, "Bad ref name") + } + Ok(name) + }) + .try_collect() + .await + .map_err(Box::new)?) + } + + #[instrument(skip(self, settings, bytes))] + async fn write_ref( + &self, + settings: &Settings, + ref_key: &str, + bytes: Bytes, + previous_version: &VersionInfo, + ) -> StorageResult { + let key = self.ref_key(ref_key); + let mode = self.get_put_mode(settings, previous_version); + let opts = PutOptions { mode, ..PutOptions::default() }; + + match self + .get_client(settings) + .await + .put_opts(&key, PutPayload::from_bytes(bytes), opts) + .await + { + Ok(_) => Ok(WriteRefResult::Written), + Err(object_store::Error::Precondition { .. }) + | Err(object_store::Error::AlreadyExists { .. 
}) => { + Ok(WriteRefResult::WontOverwrite) + } + Err(err) => Err(Box::new(err).into()), + } + } + + #[instrument(skip(self, settings))] + async fn list_objects<'a>( + &'a self, + settings: &Settings, + prefix: &str, + ) -> StorageResult>>> { + let prefix = ObjectPath::from(format!("{}/{}", self.backend.prefix(), prefix)); + let stream = self + .get_client(settings) + .await + .list(Some(&prefix)) + // TODO: we should signal error instead of filtering + .try_filter_map(|object| async move { + let info = object_to_list_info(&object); + if info.is_none() { + tracing::error!(object=?object, "Found bad object while listing"); + } + Ok(info) + }) + .map_err(Box::new) + .err_into(); + Ok(stream.boxed()) + } + + #[instrument(skip(self, batch))] + async fn delete_batch( + &self, + settings: &Settings, + prefix: &str, + batch: Vec<(String, u64)>, + ) -> StorageResult { + let mut sizes = HashMap::new(); + let mut ids = Vec::new(); + for (id, size) in batch { + let path = self.get_path_str(prefix, id.as_str()); + ids.push(Ok(path.clone())); + sizes.insert(path, size); + } + let results = + self.get_client(settings).await.delete_stream(stream::iter(ids).boxed()); + let res = results + .fold(DeleteObjectsResult::default(), |mut res, delete_result| { + if let Ok(deleted_path) = delete_result { + if let Some(size) = sizes.get(&deleted_path) { + res.deleted_objects += 1; + res.deleted_bytes += *size; + } + } else { + tracing::error!( + error = ?delete_result, + "Error deleting object", + ); + } + ready(res) + }) + .await; + Ok(res) + } + + #[instrument(skip(self, settings))] + async fn get_snapshot_last_modified( + &self, + settings: &Settings, + snapshot: &SnapshotId, + ) -> StorageResult> { + let path = self.get_snapshot_path(snapshot); + let res = self.get_client(settings).await.head(&path).await.map_err(Box::new)?; + Ok(res.last_modified) + } + + #[instrument(skip(self))] + async fn get_object_range_buf( + &self, + settings: &Settings, + key: &str, + range: &Range, + ) -> 
StorageResult> { + let path = ObjectPath::from(key); + let usize_range = range.start..range.end; + let range = Some(usize_range.into()); + let opts = GetOptions { range, ..Default::default() }; + Ok(Box::new( + self.get_client(settings) + .await + .get_opts(&path, opts) + .await + .map_err(Box::new)? + .bytes() + .await + .map_err(Box::new)?, + )) + } + + #[instrument(skip(self))] + async fn get_object_range_read( + &self, + settings: &Settings, + key: &str, + range: &Range, + ) -> StorageResult> { + let path = ObjectPath::from(key); + let usize_range = range.start..range.end; + let range = Some(usize_range.into()); + let opts = GetOptions { range, ..Default::default() }; + let res: Box = Box::new( + self.get_client(settings) + .await + .get_opts(&path, opts) + .await + .map_err(Box::new)? + .into_stream() + .err_into() + .into_async_read() + .compat(), + ); + Ok(res) + } +} + +#[typetag::serde(tag = "object_store_provider_type")] +pub trait ObjectStoreBackend: Debug + Display + Sync + Send { + fn mk_object_store( + &self, + settings: &Settings, + ) -> Result, StorageError>; + + /// The prefix for the object store. + fn prefix(&self) -> String; + + /// We need this because object_store's local file implementation doesn't sort refs. Since this + /// implementation is used only for tests, it's OK to sort in memory. 
+ fn artificially_sort_refs_in_mem(&self) -> bool { + false + } + + fn default_settings(&self) -> Settings; + + fn can_write(&self) -> bool { + true + } +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct InMemoryObjectStoreBackend; + +impl fmt::Display for InMemoryObjectStoreBackend { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "InMemoryObjectStoreBackend") + } +} + +#[typetag::serde(name = "in_memory_object_store_provider")] +impl ObjectStoreBackend for InMemoryObjectStoreBackend { + fn mk_object_store( + &self, + _settings: &Settings, + ) -> Result, StorageError> { + Ok(Arc::new(InMemory::new())) + } + + fn prefix(&self) -> String { + "".to_string() + } + + fn default_settings(&self) -> Settings { + Settings { + concurrency: Some(ConcurrencySettings { + // we do != 1 because we use this store for tests + max_concurrent_requests_for_object: Some( + NonZeroU16::new(5).unwrap_or(NonZeroU16::MIN), + ), + ideal_concurrent_request_size: Some( + NonZeroU64::new(1).unwrap_or(NonZeroU64::MIN), + ), + }), + retries: Some(RetriesSettings { + max_tries: Some(NonZeroU16::MIN), + initial_backoff_ms: Some(0), + max_backoff_ms: Some(0), + }), + + ..Default::default() + } + } +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct LocalFileSystemObjectStoreBackend { + path: PathBuf, +} + +impl fmt::Display for LocalFileSystemObjectStoreBackend { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "LocalFileSystemObjectStoreBackend(path={})", self.path.display()) + } +} + +#[typetag::serde(name = "local_file_system_object_store_provider")] +impl ObjectStoreBackend for LocalFileSystemObjectStoreBackend { + fn mk_object_store( + &self, + _settings: &Settings, + ) -> Result, StorageError> { + create_dir_all(&self.path).map_err(|e| StorageErrorKind::Other(e.to_string()))?; + + let path = std::fs::canonicalize(&self.path) + .map_err(|e| StorageErrorKind::Other(e.to_string()))?; + Ok(Arc::new( + 
LocalFileSystem::new_with_prefix(path) + .map_err(|e| StorageErrorKind::Other(e.to_string()))?, + )) + } + + fn prefix(&self) -> String { + "".to_string() + } + + fn artificially_sort_refs_in_mem(&self) -> bool { + true + } + + fn default_settings(&self) -> Settings { + Settings { + concurrency: Some(ConcurrencySettings { + max_concurrent_requests_for_object: Some( + NonZeroU16::new(5).unwrap_or(NonZeroU16::MIN), + ), + ideal_concurrent_request_size: Some( + NonZeroU64::new(4 * 1024).unwrap_or(NonZeroU64::MIN), + ), + }), + unsafe_use_conditional_update: Some(false), + unsafe_use_metadata: Some(false), + retries: Some(RetriesSettings { + max_tries: Some(NonZeroU16::new(1).unwrap_or(NonZeroU16::MIN)), + initial_backoff_ms: Some(0), + max_backoff_ms: Some(0), + }), + ..Default::default() + } + } +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct HttpObjectStoreBackend { + pub url: String, + pub config: Option>, +} + +impl fmt::Display for HttpObjectStoreBackend { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "HttpObjectStoreBackend(url={}, config={})", + self.url, + self.config + .as_ref() + .map(|c| c + .iter() + .map(|(k, v)| format!("{k:?}={v}")) + .collect::>() + .join(", ")) + .unwrap_or("None".to_string()) + ) + } +} + +#[typetag::serde(name = "http_object_store_provider")] +impl ObjectStoreBackend for HttpObjectStoreBackend { + fn mk_object_store( + &self, + settings: &Settings, + ) -> Result, StorageError> { + let builder = HttpBuilder::new().with_url(&self.url); + + // Add options + let builder = self + .config + .as_ref() + .unwrap_or(&HashMap::new()) + .iter() + .fold(builder, |builder, (key, value)| builder.with_config(*key, value)); + + let builder = builder.with_retry(RetryConfig { + backoff: BackoffConfig { + init_backoff: core::time::Duration::from_millis( + settings.retries().initial_backoff_ms() as u64, + ), + max_backoff: core::time::Duration::from_millis( + settings.retries().max_backoff_ms() as u64, + ), + 
base: 2., + }, + max_retries: settings.retries().max_tries().get() as usize - 1, + retry_timeout: core::time::Duration::from_secs(5 * 60), + }); + + let store = + builder.build().map_err(|e| StorageErrorKind::Other(e.to_string()))?; + + Ok(Arc::new(store)) + } + + fn prefix(&self) -> String { + "".to_string() + } + + fn default_settings(&self) -> Settings { + Default::default() + } + + fn can_write(&self) -> bool { + // TODO: Support write operations? + false + } +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct S3ObjectStoreBackend { + bucket: String, + prefix: Option, + credentials: Option, + config: Option, +} + +impl fmt::Display for S3ObjectStoreBackend { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "S3ObjectStoreBackend(bucket={}, prefix={}, config={})", + self.bucket, + self.prefix.as_deref().unwrap_or(""), + self.config.as_ref().map(|c| c.to_string()).unwrap_or("None".to_string()) + ) + } +} + +#[typetag::serde(name = "s3_object_store_provider")] +impl ObjectStoreBackend for S3ObjectStoreBackend { + fn mk_object_store( + &self, + settings: &Settings, + ) -> Result, StorageError> { + let builder = AmazonS3Builder::new(); + + let builder = match self.credentials.as_ref() { + Some(S3Credentials::Static(credentials)) => { + let builder = builder + .with_access_key_id(credentials.access_key_id.clone()) + .with_secret_access_key(credentials.secret_access_key.clone()); + + if let Some(session_token) = credentials.session_token.as_ref() { + builder.with_token(session_token.clone()) + } else { + builder + } + } + Some(S3Credentials::Anonymous) => builder.with_skip_signature(true), + // TODO: Support refreshable credentials + _ => AmazonS3Builder::from_env(), + }; + + let builder = if let Some(config) = self.config.as_ref() { + let builder = if let Some(region) = config.region.as_ref() { + builder.with_region(region.to_string()) + } else { + builder + }; + + let builder = if let Some(endpoint) = config.endpoint_url.as_ref() { 
+ builder.with_endpoint(endpoint.to_string()) + } else { + builder + }; + + builder + .with_skip_signature(config.anonymous) + .with_allow_http(config.allow_http) + } else { + builder + }; + + // Defaults + let builder = builder + .with_bucket_name(&self.bucket) + .with_conditional_put(object_store::aws::S3ConditionalPut::ETagMatch); + + let builder = builder.with_retry(RetryConfig { + backoff: BackoffConfig { + init_backoff: core::time::Duration::from_millis( + settings.retries().initial_backoff_ms() as u64, + ), + max_backoff: core::time::Duration::from_millis( + settings.retries().max_backoff_ms() as u64, + ), + base: 2., + }, + max_retries: settings.retries().max_tries().get() as usize - 1, + retry_timeout: core::time::Duration::from_secs(5 * 60), + }); + + let store = + builder.build().map_err(|e| StorageErrorKind::Other(e.to_string()))?; + Ok(Arc::new(store)) + } + + fn prefix(&self) -> String { + self.prefix.clone().unwrap_or("".to_string()) + } + + fn default_settings(&self) -> Settings { + Default::default() + } +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct AzureObjectStoreBackend { + account: String, + container: String, + prefix: Option, + credentials: Option, + config: Option>, +} + +impl fmt::Display for AzureObjectStoreBackend { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "AzureObjectStoreBackend(account={}, container={}, prefix={})", + self.account, + self.container, + self.prefix.as_deref().unwrap_or("") + ) + } +} + +#[typetag::serde(name = "azure_object_store_provider")] +impl ObjectStoreBackend for AzureObjectStoreBackend { + fn mk_object_store( + &self, + settings: &Settings, + ) -> Result, StorageError> { + let builder = MicrosoftAzureBuilder::new(); + + let builder = match self.credentials.as_ref() { + Some(AzureCredentials::Static(AzureStaticCredentials::AccessKey(key))) => { + builder.with_access_key(key) + } + Some(AzureCredentials::Static(AzureStaticCredentials::SASToken(token))) => { + 
builder.with_config(AzureConfigKey::SasKey, token) + } + Some(AzureCredentials::Static(AzureStaticCredentials::BearerToken( + token, + ))) => builder.with_bearer_token_authorization(token), + None | Some(AzureCredentials::FromEnv) => MicrosoftAzureBuilder::from_env(), + }; + + // Either the account name should be provided or user_emulator should be set to true to use the default account + let builder = + builder.with_account(&self.account).with_container_name(&self.container); + + let builder = self + .config + .as_ref() + .unwrap_or(&HashMap::new()) + .iter() + .fold(builder, |builder, (key, value)| builder.with_config(*key, value)); + + let builder = builder.with_retry(RetryConfig { + backoff: BackoffConfig { + init_backoff: core::time::Duration::from_millis( + settings.retries().initial_backoff_ms() as u64, + ), + max_backoff: core::time::Duration::from_millis( + settings.retries().max_backoff_ms() as u64, + ), + base: 2., + }, + max_retries: settings.retries().max_tries().get() as usize - 1, + retry_timeout: core::time::Duration::from_secs(5 * 60), + }); + + let store = + builder.build().map_err(|e| StorageErrorKind::Other(e.to_string()))?; + Ok(Arc::new(store)) + } + + fn prefix(&self) -> String { + self.prefix.clone().unwrap_or("".to_string()) + } + + fn default_settings(&self) -> Settings { + Default::default() + } +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct GcsObjectStoreBackend { + pub bucket: String, + pub prefix: Option, + pub credentials: Option, + pub config: Option>, +} + +impl fmt::Display for GcsObjectStoreBackend { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "GcsObjectStoreBackend(bucket={}, prefix={})", + self.bucket, + self.prefix.as_deref().unwrap_or("") + ) + } +} + +#[typetag::serde(name = "gcs_object_store_provider")] +impl ObjectStoreBackend for GcsObjectStoreBackend { + fn mk_object_store( + &self, + settings: &Settings, + ) -> Result, StorageError> { + let builder = 
GoogleCloudStorageBuilder::new(); + + let builder = match self.credentials.as_ref() { + Some(GcsCredentials::Static(GcsStaticCredentials::ServiceAccount(path))) => { + let path = path.clone().into_os_string().into_string().map_err(|_| { + StorageErrorKind::Other("invalid service account path".to_string()) + })?; + builder.with_service_account_path(path) + } + Some(GcsCredentials::Static(GcsStaticCredentials::ServiceAccountKey( + key, + ))) => builder.with_service_account_key(key), + Some(GcsCredentials::Static( + GcsStaticCredentials::ApplicationCredentials(path), + )) => { + let path = path.clone().into_os_string().into_string().map_err(|_| { + StorageErrorKind::Other( + "invalid application credentials path".to_string(), + ) + })?; + builder.with_application_credentials(path) + } + Some(GcsCredentials::Static(GcsStaticCredentials::BearerToken(token))) => { + let provider = StaticCredentialProvider::new(GcpCredential::from(token)); + builder.with_credentials(Arc::new(provider)) + } + Some(GcsCredentials::Refreshable(fetcher)) => { + let credential_provider = + GcsRefreshableCredentialProvider::new(Arc::clone(fetcher)); + builder.with_credentials(Arc::new(credential_provider)) + } + Some(GcsCredentials::Anonymous) => builder.with_skip_signature(true), + None | Some(GcsCredentials::FromEnv) => GoogleCloudStorageBuilder::from_env(), + }; + + let builder = builder.with_bucket_name(&self.bucket); + + // Add options + let builder = self + .config + .as_ref() + .unwrap_or(&HashMap::new()) + .iter() + .fold(builder, |builder, (key, value)| builder.with_config(*key, value)); + + let builder = builder.with_retry(RetryConfig { + backoff: BackoffConfig { + init_backoff: core::time::Duration::from_millis( + settings.retries().initial_backoff_ms() as u64, + ), + max_backoff: core::time::Duration::from_millis( + settings.retries().max_backoff_ms() as u64, + ), + base: 2., + }, + max_retries: settings.retries().max_tries().get() as usize - 1, + retry_timeout: 
core::time::Duration::from_secs(5 * 60), + }); + let store = + builder.build().map_err(|e| StorageErrorKind::Other(e.to_string()))?; + Ok(Arc::new(store)) + } + + fn prefix(&self) -> String { + self.prefix.clone().unwrap_or("".to_string()) + } + + fn default_settings(&self) -> Settings { + Default::default() + } +} + +#[derive(Debug)] +pub struct GcsRefreshableCredentialProvider { + last_credential: Arc>>, + refresher: Arc, +} + +impl GcsRefreshableCredentialProvider { + pub fn new(refresher: Arc) -> Self { + Self { last_credential: Arc::new(RwLock::new(None)), refresher } + } + + pub async fn get_or_update_credentials( + &self, + ) -> Result { + let last_credential = self.last_credential.read().await; + + // If we have a credential and it hasn't expired, return it + if let Some(creds) = last_credential.as_ref() { + if let Some(expires_after) = creds.expires_after { + if expires_after + > Utc::now() + TimeDelta::seconds(rand::random_range(120..=180)) + { + return Ok(creds.clone()); + } + } + } + + drop(last_credential); + let mut last_credential = self.last_credential.write().await; + + // Otherwise, refresh the credential and cache it + let creds = self + .refresher + .get() + .await + .map_err(|e| StorageErrorKind::Other(e.to_string()))?; + *last_credential = Some(creds.clone()); + Ok(creds) + } +} + +#[async_trait] +impl CredentialProvider for GcsRefreshableCredentialProvider { + type Credential = GcpCredential; + + async fn get_credential(&self) -> object_store::Result> { + let creds = self.get_or_update_credentials().await.map_err(|e| { + object_store::Error::Generic { store: "gcp", source: Box::new(e) } + })?; + Ok(Arc::new(GcpCredential::from(&creds))) + } +} + +fn object_to_list_info(object: &ObjectMeta) -> Option> { + let created_at = object.last_modified; + let id = object.location.filename()?.to_string(); + let size_bytes = object.size; + Some(ListInfo { id, created_at, size_bytes }) +} + +#[cfg(test)] +#[allow(clippy::expect_used, clippy::unwrap_used)] 
+mod tests { + use std::path::PathBuf; + + use icechunk_macros::tokio_test; + use tempfile::TempDir; + + use crate::format::{ChunkId, ManifestId, SnapshotId}; + + use super::ObjectStorage; + + #[tokio_test] + async fn test_serialize_object_store() { + let tmp_dir = TempDir::new().unwrap(); + let store = ObjectStorage::new_local_filesystem(tmp_dir.path()).await.unwrap(); + + let serialized = serde_json::to_string(&store).unwrap(); + + let deserialized: ObjectStorage = serde_json::from_str(&serialized).unwrap(); + assert_eq!( + store.backend.as_ref().prefix(), + deserialized.backend.as_ref().prefix() + ); + } + + struct TestLocalPath(String); + + impl From<&TestLocalPath> for std::path::PathBuf { + fn from(path: &TestLocalPath) -> Self { + std::path::PathBuf::from(&path.0) + } + } + + impl Drop for TestLocalPath { + fn drop(&mut self) { + let _ = std::fs::remove_dir_all(&self.0); + } + } + + #[tokio_test] + async fn test_canonicalize_path() { + // Absolute path + let tmp_dir = TempDir::new().unwrap(); + let store = ObjectStorage::new_local_filesystem(tmp_dir.path()).await; + assert!(store.is_ok()); + + // Relative path + let rel_path = "relative/path"; + let store = + ObjectStorage::new_local_filesystem(PathBuf::from(&rel_path).as_path()).await; + assert!(store.is_ok()); + + // Relative with leading ./ + let rel_path = TestLocalPath("./other/path".to_string()); + let store = + ObjectStorage::new_local_filesystem(PathBuf::from(&rel_path).as_path()).await; + assert!(store.is_ok()); + } + + #[tokio_test] + async fn test_object_store_paths() { + let store = ObjectStorage::new_local_filesystem(PathBuf::from(".").as_path()) + .await + .unwrap(); + + let ref_key = "ref_key"; + let ref_path = store.ref_key(ref_key); + assert_eq!(ref_path.to_string(), format!("refs/{ref_key}")); + + let snapshot_id = SnapshotId::random(); + let snapshot_path = store.get_snapshot_path(&snapshot_id); + assert_eq!(snapshot_path.to_string(), format!("snapshots/{snapshot_id}")); + + let 
manifest_id = ManifestId::random(); + let manifest_path = store.get_manifest_path(&manifest_id); + assert_eq!(manifest_path.to_string(), format!("manifests/{manifest_id}")); + + let chunk_id = ChunkId::random(); + let chunk_path = store.get_chunk_path(&chunk_id); + assert_eq!(chunk_path.to_string(), format!("chunks/{chunk_id}")); + + let transaction_id = SnapshotId::random(); + let transaction_path = store.get_transaction_path(&transaction_id); + assert_eq!( + transaction_path.to_string(), + format!("transactions/{transaction_id}") + ); + } +} diff --git a/icechunk/src/storage/implementations/s3.rs b/icechunk/src/storage/implementations/s3.rs new file mode 100644 index 000000000..9f6f3cc75 --- /dev/null +++ b/icechunk/src/storage/implementations/s3.rs @@ -0,0 +1,1144 @@ +use std::{ + collections::HashMap, + fmt, + future::ready, + ops::Range, + path::{Path, PathBuf}, + sync::Arc, +}; + +use crate::{ + config::{S3Credentials, S3CredentialsFetcher, S3Options}, + format::{ChunkId, ChunkOffset, FileTypeTag, ManifestId, ObjectId, SnapshotId}, + private, + storage::{Storage, StorageError}, +}; +use async_trait::async_trait; +use aws_config::{ + AppName, BehaviorVersion, + meta::region::RegionProviderChain, + retry::{ProvideErrorKind, RetryConfig}, +}; +use aws_credential_types::provider::error::CredentialsError; +use aws_sdk_s3::{ + Client, + config::{ + Builder, ConfigBag, IdentityCache, Intercept, ProvideCredentials, Region, + RuntimeComponents, interceptors::BeforeTransmitInterceptorContextMut, + }, + error::{BoxError, SdkError}, + operation::put_object::PutObjectError, + primitives::ByteStream, + types::{CompletedMultipartUpload, CompletedPart, Delete, Object, ObjectIdentifier}, +}; +use aws_smithy_types_convert::{date_time::DateTimeExt, stream::PaginationStreamExt}; +use bytes::{Buf, Bytes}; +use chrono::{DateTime, Utc}; +use futures::{ + StreamExt, TryStreamExt, + stream::{self, BoxStream, FuturesOrdered}, +}; +use serde::{Deserialize, Serialize}; +use 
tokio::{io::AsyncRead, sync::OnceCell}; +use tracing::{error, instrument}; + +use crate::storage::{ + CHUNK_PREFIX, CONFIG_PATH, DeleteObjectsResult, FetchConfigResult, GetRefResult, + ListInfo, MANIFEST_PREFIX, REF_PREFIX, Reader, SNAPSHOT_PREFIX, Settings, + StorageErrorKind, StorageResult, TRANSACTION_PREFIX, UpdateConfigResult, VersionInfo, + WriteRefResult, split_in_multiple_equal_requests, +}; + +#[derive(Debug, Serialize, Deserialize)] +pub struct S3Storage { + // config and credentials are stored so we are able to serialize and deserialize the struct + config: S3Options, + credentials: S3Credentials, + bucket: String, + prefix: String, + can_write: bool, + extra_read_headers: Vec<(String, String)>, + extra_write_headers: Vec<(String, String)>, + #[serde(skip)] + /// We need to use OnceCell to allow async initialization, because serde + /// does not support async cfunction calls from deserialization. This gives + /// us a way to lazily initialize the client. + client: OnceCell>, +} + +impl fmt::Display for S3Storage { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "S3Storage(bucket={}, prefix={}, config={})", + self.bucket, self.prefix, self.config, + ) + } +} +#[derive(Debug)] +struct ExtraHeadersInterceptor { + extra_read_headers: Vec<(String, String)>, + extra_write_headers: Vec<(String, String)>, +} + +impl Intercept for ExtraHeadersInterceptor { + fn name(&self) -> &'static str { + "ExtraHeaders" + } + + fn modify_before_retry_loop( + &self, + context: &mut BeforeTransmitInterceptorContextMut<'_>, + _runtime_components: &RuntimeComponents, + _cfg: &mut ConfigBag, + ) -> Result<(), BoxError> { + let request = context.request_mut(); + match request.method() { + "GET" | "HEAD" | "OPTIONS" | "TRACE" => { + for (k, v) in self.extra_read_headers.iter() { + request.headers_mut().insert(k.clone(), v.clone()); + } + } + _ => { + for (k, v) in self.extra_write_headers.iter() { + request.headers_mut().insert(k.clone(), v.clone()); + 
} + } + } + Ok(()) + } +} + +#[instrument(skip(credentials))] +pub async fn mk_client( + config: &S3Options, + credentials: S3Credentials, + extra_read_headers: Vec<(String, String)>, + extra_write_headers: Vec<(String, String)>, + settings: &Settings, +) -> Client { + let region = config + .region + .as_ref() + .map(|r| RegionProviderChain::first_try(Some(Region::new(r.clone())))) + .unwrap_or_else(RegionProviderChain::default_provider); + + let endpoint = config.endpoint_url.clone(); + let region = if endpoint.is_some() { + // GH793, the S3 SDK requires a region even though it may not make sense + // for S3-compatible object stores like Tigris or Ceph. + // So we set a fake region, using the `endpoint_url` as a sign that + // we are not talking to real S3 + region.or_else(Region::new("region-was-not-set")) + } else { + region + }; + + #[allow(clippy::unwrap_used)] + let app_name = AppName::new("icechunk").unwrap(); + let mut aws_config = aws_config::defaults(BehaviorVersion::v2025_01_17()) + .region(region) + .app_name(app_name); + + if let Some(endpoint) = endpoint { + aws_config = aws_config.endpoint_url(endpoint) + } + + match credentials { + S3Credentials::FromEnv => {} + S3Credentials::Anonymous => aws_config = aws_config.no_credentials(), + S3Credentials::Static(credentials) => { + aws_config = + aws_config.credentials_provider(aws_credential_types::Credentials::new( + credentials.access_key_id, + credentials.secret_access_key, + credentials.session_token, + credentials.expires_after.map(|e| e.into()), + "user", + )); + } + S3Credentials::Refreshable(fetcher) => { + aws_config = + aws_config.credentials_provider(ProvideRefreshableCredentials(fetcher)); + } + } + + let retry_config = RetryConfig::standard() + .with_max_attempts(settings.retries().max_tries().get() as u32) + .with_initial_backoff(core::time::Duration::from_millis( + settings.retries().initial_backoff_ms() as u64, + )) + .with_max_backoff(core::time::Duration::from_millis( + 
settings.retries().max_backoff_ms() as u64, + )); + + let mut s3_builder = Builder::from(&aws_config.load().await) + .force_path_style(config.force_path_style) + .retry_config(retry_config); + + // credentials may take a while to refresh, defaults are too strict + let id_cache = IdentityCache::lazy() + .load_timeout(core::time::Duration::from_secs(120)) + .buffer_time(core::time::Duration::from_secs(120)) + .build(); + + s3_builder = s3_builder.identity_cache(id_cache); + + if !extra_read_headers.is_empty() || !extra_write_headers.is_empty() { + s3_builder = s3_builder.interceptor(ExtraHeadersInterceptor { + extra_read_headers, + extra_write_headers, + }) + } + + let config = s3_builder.build(); + + Client::from_conf(config) +} + +impl S3Storage { + pub fn new( + config: S3Options, + bucket: String, + prefix: Option, + credentials: S3Credentials, + can_write: bool, + extra_read_headers: Vec<(String, String)>, + extra_write_headers: Vec<(String, String)>, + ) -> Result { + let client = OnceCell::new(); + Ok(S3Storage { + client, + config, + bucket, + prefix: prefix.unwrap_or_default(), + credentials, + can_write, + extra_read_headers, + extra_write_headers, + }) + } + + /// Get the client, initializing it if it hasn't been initialized yet. This is necessary because the + /// client is not serializeable and must be initialized after deserialization. Under normal construction + /// the original client is returned immediately. 
+ #[instrument(skip_all)] + async fn get_client(&self, settings: &Settings) -> &Arc { + self.client + .get_or_init(|| async { + Arc::new( + mk_client( + &self.config, + self.credentials.clone(), + self.extra_read_headers.clone(), + self.extra_write_headers.clone(), + settings, + ) + .await, + ) + }) + .await + } + + fn get_path_str(&self, file_prefix: &str, id: &str) -> StorageResult { + let path = PathBuf::from_iter([self.prefix.as_str(), file_prefix, id]); + let path_str = + path.into_os_string().into_string().map_err(StorageErrorKind::BadPrefix)?; + + Ok(path_str.replace("\\", "/")) + } + + fn get_path( + &self, + file_prefix: &str, + id: &ObjectId, + ) -> StorageResult { + // we serialize the url using crockford + self.get_path_str(file_prefix, id.to_string().as_str()) + } + + fn get_config_path(&self) -> StorageResult { + self.get_path_str("", CONFIG_PATH) + } + + fn get_snapshot_path(&self, id: &SnapshotId) -> StorageResult { + self.get_path(SNAPSHOT_PREFIX, id) + } + + fn get_manifest_path(&self, id: &ManifestId) -> StorageResult { + self.get_path(MANIFEST_PREFIX, id) + } + + fn get_chunk_path(&self, id: &ChunkId) -> StorageResult { + self.get_path(CHUNK_PREFIX, id) + } + + fn get_transaction_path(&self, id: &SnapshotId) -> StorageResult { + self.get_path(TRANSACTION_PREFIX, id) + } + + fn ref_key(&self, ref_key: &str) -> StorageResult { + let path = PathBuf::from_iter([self.prefix.as_str(), REF_PREFIX, ref_key]); + let path_str = + path.into_os_string().into_string().map_err(StorageErrorKind::BadPrefix)?; + + Ok(path_str.replace("\\", "/")) + } + + async fn get_object_reader( + &self, + settings: &Settings, + key: &str, + ) -> StorageResult> { + let client = self.get_client(settings).await; + let b = client.get_object().bucket(self.bucket.as_str()).key(key); + Ok(Box::new(b.send().await.map_err(Box::new)?.body.into_async_read())) + } + + async fn put_object_single< + I: IntoIterator, impl Into)>, + >( + &self, + settings: &Settings, + key: &str, + 
content_type: Option>, + metadata: I, + storage_class: Option<&String>, + bytes: impl Into, + ) -> StorageResult<()> { + let mut b = self + .get_client(settings) + .await + .put_object() + .bucket(self.bucket.clone()) + .key(key); + + if settings.unsafe_use_metadata() { + if let Some(ct) = content_type { + b = b.content_type(ct) + }; + } + + if settings.unsafe_use_metadata() { + for (k, v) in metadata { + b = b.metadata(k, v); + } + } + + if let Some(klass) = storage_class { + let klass = klass.as_str().into(); + b = b.storage_class(klass); + } + + b.body(bytes.into()).send().await.map_err(Box::new)?; + Ok(()) + } + + async fn put_object_multipart< + I: IntoIterator, impl Into)>, + >( + &self, + settings: &Settings, + key: &str, + content_type: Option>, + metadata: I, + storage_class: Option<&String>, + bytes: &Bytes, + ) -> StorageResult<()> { + let mut multi = self + .get_client(settings) + .await + .create_multipart_upload() + // We would like this, but it fails in MinIO + //.checksum_type(aws_sdk_s3::types::ChecksumType::FullObject) + //.checksum_algorithm(aws_sdk_s3::types::ChecksumAlgorithm::Crc64Nvme) + .bucket(self.bucket.clone()) + .key(key); + + if settings.unsafe_use_metadata() { + if let Some(ct) = content_type { + multi = multi.content_type(ct) + }; + for (k, v) in metadata { + multi = multi.metadata(k, v); + } + } + + if let Some(klass) = storage_class { + let klass = klass.as_str().into(); + multi = multi.storage_class(klass); + } + + let create_res = multi.send().await.map_err(Box::new)?; + let upload_id = + create_res.upload_id().ok_or(StorageError::from(StorageErrorKind::Other( + "No upload_id in create multipart upload result".to_string(), + )))?; + + // We need to ensure all requests are the same size except for the last one, which can be + // smaller. 
This is a requirement for R2 compatibility + let parts = split_in_multiple_equal_requests( + &(0..bytes.len() as u64), + settings.concurrency().ideal_concurrent_request_size().get(), + settings.concurrency().max_concurrent_requests_for_object().get(), + ) + .collect::>(); + + let results = parts + .into_iter() + .enumerate() + .map(|(part_idx, range)| async move { + let body = bytes.slice(range.start as usize..range.end as usize).into(); + let idx = part_idx as i32 + 1; + self.get_client(settings) + .await + .upload_part() + .upload_id(upload_id) + .bucket(self.bucket.clone()) + .key(key) + .part_number(idx) + .body(body) + .send() + .await + .map(|res| (idx, res)) + }) + .collect::>(); + + let completed_parts = results + .map_ok(|(idx, res)| { + let etag = res.e_tag().unwrap_or(""); + CompletedPart::builder() + .e_tag(strip_quotes(etag)) + .part_number(idx) + .build() + }) + .try_collect::>() + .await + .map_err(Box::new)?; + + let completed_parts = + CompletedMultipartUpload::builder().set_parts(Some(completed_parts)).build(); + + self.get_client(settings) + .await + .complete_multipart_upload() + .bucket(self.bucket.clone()) + .key(key) + .upload_id(upload_id) + //.checksum_type(aws_sdk_s3::types::ChecksumType::FullObject) + .multipart_upload(completed_parts) + .send() + .await + .map_err(Box::new)?; + + Ok(()) + } + + async fn put_object< + I: IntoIterator, impl Into)>, + >( + &self, + settings: &Settings, + key: &str, + content_type: Option>, + metadata: I, + storage_class: Option<&String>, + bytes: &Bytes, + ) -> StorageResult<()> { + if bytes.len() >= settings.minimum_size_for_multipart_upload() as usize { + self.put_object_multipart( + settings, + key, + content_type, + metadata, + storage_class, + bytes, + ) + .await + } else { + self.put_object_single( + settings, + key, + content_type, + metadata, + storage_class, + bytes.clone(), + ) + .await + } + } + + fn get_ref_name<'a>(&self, key: Option<&'a str>) -> Option<&'a str> { + let key = key?; + let prefix 
= self.ref_key("").ok()?; + let relative_key = key.strip_prefix(&prefix)?; + let ref_name = relative_key.split('/').next()?; + Some(ref_name) + } +} + +pub fn range_to_header(range: &Range) -> String { + format!("bytes={}-{}", range.start, range.end - 1) +} + +impl private::Sealed for S3Storage {} + +#[async_trait] +#[typetag::serde] +impl Storage for S3Storage { + fn can_write(&self) -> bool { + self.can_write + } + + #[instrument(skip_all)] + async fn fetch_config( + &self, + settings: &Settings, + ) -> StorageResult { + let key = self.get_config_path()?; + let res = self + .get_client(settings) + .await + .get_object() + .bucket(self.bucket.clone()) + .key(key) + .send() + .await; + + match res { + Ok(output) => match output.e_tag { + Some(etag) => Ok(FetchConfigResult::Found { + bytes: output.body.collect().await.map_err(Box::new)?.into_bytes(), + version: VersionInfo::from_etag_only(etag), + }), + None => Ok(FetchConfigResult::NotFound), + }, + Err(sdk_err) => match sdk_err.as_service_error() { + Some(e) if e.is_no_such_key() => Ok(FetchConfigResult::NotFound), + Some(_) + if sdk_err + .raw_response() + .is_some_and(|x| x.status().as_u16() == 404) => + { + // needed for Cloudflare R2 public bucket URLs + // if config doesn't exist we get a 404 that isn't parsed by the AWS SDK + // into anything useful. So we need to parse the raw response, and match + // the status code. 
+ Ok(FetchConfigResult::NotFound) + } + _ => Err(Box::new(sdk_err).into()), + }, + } + } + + #[instrument(skip(self, settings, config))] + async fn update_config( + &self, + settings: &Settings, + config: Bytes, + previous_version: &VersionInfo, + ) -> StorageResult { + let key = self.get_config_path()?; + let mut req = self + .get_client(settings) + .await + .put_object() + .bucket(self.bucket.clone()) + .key(key) + .body(config.into()); + + if settings.unsafe_use_metadata() { + req = req.content_type("application/yaml") + } + + if let Some(klass) = settings.metadata_storage_class() { + req = req.storage_class(klass.as_str().into()) + } + + match ( + previous_version.etag(), + settings.unsafe_use_conditional_create(), + settings.unsafe_use_conditional_update(), + ) { + (None, true, _) => req = req.if_none_match("*"), + (Some(etag), _, true) => req = req.if_match(strip_quotes(etag)), + (_, _, _) => {} + } + + let res = req.send().await; + + match res { + Ok(out) => { + let new_etag = out + .e_tag() + .ok_or(StorageErrorKind::Other( + "Config object should have an etag".to_string(), + ))? 
+ .to_string(); + let new_version = VersionInfo::from_etag_only(new_etag); + Ok(UpdateConfigResult::Updated { new_version }) + } + // minio returns this + Err(SdkError::ServiceError(err)) => { + if err.err().meta().code() == Some("PreconditionFailed") { + Ok(UpdateConfigResult::NotOnLatestVersion) + } else { + Err(StorageError::from(Box::new( + SdkError::::ServiceError(err), + ))) + } + } + // S3 API documents this + Err(SdkError::ResponseError(err)) => { + let status = err.raw().status().as_u16(); + // see https://docs.aws.amazon.com/AmazonS3/latest/API/API_PutObject.html#API_PutObject_RequestSyntax + if status == 409 || status == 412 { + Ok(UpdateConfigResult::NotOnLatestVersion) + } else { + Err(StorageError::from(Box::new( + SdkError::::ResponseError(err), + ))) + } + } + Err(err) => Err(Box::new(err).into()), + } + } + + #[instrument(skip(self, settings))] + async fn fetch_snapshot( + &self, + settings: &Settings, + id: &SnapshotId, + ) -> StorageResult> { + let key = self.get_snapshot_path(id)?; + self.get_object_reader(settings, key.as_str()).await + } + + #[instrument(skip(self, settings))] + async fn fetch_manifest_known_size( + &self, + settings: &Settings, + id: &ManifestId, + size: u64, + ) -> StorageResult { + let key = self.get_manifest_path(id)?; + self.get_object_concurrently(settings, key.as_str(), &(0..size)).await + } + + #[instrument(skip(self, settings))] + async fn fetch_manifest_unknown_size( + &self, + settings: &Settings, + id: &ManifestId, + ) -> StorageResult> { + let key = self.get_manifest_path(id)?; + self.get_object_reader(settings, key.as_str()).await + } + + #[instrument(skip(self, settings))] + async fn fetch_transaction_log( + &self, + settings: &Settings, + id: &SnapshotId, + ) -> StorageResult> { + let key = self.get_transaction_path(id)?; + self.get_object_reader(settings, key.as_str()).await + } + + #[instrument(skip(self, settings))] + async fn fetch_chunk( + &self, + settings: &Settings, + id: &ChunkId, + range: &Range, + ) 
-> StorageResult { + let key = self.get_chunk_path(id)?; + self.get_object_concurrently(settings, key.as_str(), range) + .await? + .to_bytes((range.end - range.start) as usize) + .await + } + + #[instrument(skip(self, settings, metadata, bytes))] + async fn write_snapshot( + &self, + settings: &Settings, + id: SnapshotId, + metadata: Vec<(String, String)>, + bytes: Bytes, + ) -> StorageResult<()> { + let key = self.get_snapshot_path(&id)?; + self.put_object( + settings, + key.as_str(), + None::, + metadata, + settings.metadata_storage_class(), + &bytes, + ) + .await + } + + #[instrument(skip(self, settings, metadata, bytes))] + async fn write_manifest( + &self, + settings: &Settings, + id: ManifestId, + metadata: Vec<(String, String)>, + bytes: Bytes, + ) -> StorageResult<()> { + let key = self.get_manifest_path(&id)?; + self.put_object( + settings, + key.as_str(), + None::, + metadata.into_iter(), + settings.metadata_storage_class(), + &bytes, + ) + .await + } + + #[instrument(skip(self, settings, metadata, bytes))] + async fn write_transaction_log( + &self, + settings: &Settings, + id: SnapshotId, + metadata: Vec<(String, String)>, + bytes: Bytes, + ) -> StorageResult<()> { + let key = self.get_transaction_path(&id)?; + self.put_object( + settings, + key.as_str(), + None::, + metadata.into_iter(), + settings.metadata_storage_class(), + &bytes, + ) + .await + } + + #[instrument(skip(self, settings, bytes))] + async fn write_chunk( + &self, + settings: &Settings, + id: ChunkId, + bytes: bytes::Bytes, + ) -> Result<(), StorageError> { + let key = self.get_chunk_path(&id)?; + let metadata: [(String, String); 0] = []; + self.put_object( + settings, + key.as_str(), + None::, + metadata, + settings.chunks_storage_class(), + &bytes, + ) + .await + } + + #[instrument(skip(self, settings))] + async fn get_ref( + &self, + settings: &Settings, + ref_key: &str, + ) -> StorageResult { + let key = self.ref_key(ref_key)?; + let res = self + .get_client(settings) + .await + 
.get_object() + .bucket(self.bucket.clone()) + .key(key.clone()) + .send() + .await; + + match res { + Ok(res) => { + let bytes = res.body.collect().await.map_err(Box::new)?.into_bytes(); + if let Some(version) = res.e_tag.map(VersionInfo::from_etag_only) { + Ok(GetRefResult::Found { bytes, version }) + } else { + Ok(GetRefResult::NotFound) + } + } + Err(err) + if err + .as_service_error() + .map(|e| e.is_no_such_key()) + .unwrap_or(false) => + { + Ok(GetRefResult::NotFound) + } + Err(err) => Err(Box::new(err).into()), + } + } + + #[instrument(skip_all)] + async fn ref_names(&self, settings: &Settings) -> StorageResult> { + let prefix = self.ref_key("")?; + let mut paginator = self + .get_client(settings) + .await + .list_objects_v2() + .bucket(self.bucket.clone()) + .prefix(prefix.clone()) + .into_paginator() + .send(); + + let mut res = Vec::new(); + + while let Some(page) = paginator.try_next().await.map_err(Box::new)? { + for obj in page.contents.unwrap_or_else(Vec::new) { + let name = self.get_ref_name(obj.key()); + if let Some(name) = name { + res.push(name.to_string()); + } else { + tracing::error!(object = ?obj, "Bad ref name") + } + } + } + + Ok(res) + } + + #[instrument(skip(self, settings, bytes))] + async fn write_ref( + &self, + settings: &Settings, + ref_key: &str, + bytes: Bytes, + previous_version: &VersionInfo, + ) -> StorageResult { + let key = self.ref_key(ref_key)?; + let mut builder = self + .get_client(settings) + .await + .put_object() + .bucket(self.bucket.clone()) + .key(key.clone()); + + match ( + previous_version.etag(), + settings.unsafe_use_conditional_create(), + settings.unsafe_use_conditional_update(), + ) { + (None, true, _) => { + builder = builder.if_none_match("*"); + } + (Some(etag), _, true) => { + builder = builder.if_match(strip_quotes(etag)); + } + (_, _, _) => {} + } + + if let Some(klass) = settings.metadata_storage_class() { + builder = builder.storage_class(klass.as_str().into()) + } + + let res = 
builder.body(bytes.into()).send().await; + + match res { + Ok(_) => Ok(WriteRefResult::Written), + Err(err) => { + let code = err.as_service_error().and_then(|e| e.code()).unwrap_or(""); + if code.contains("PreconditionFailed") + || code.contains("ConditionalRequestConflict") + { + Ok(WriteRefResult::WontOverwrite) + } else { + Err(Box::new(err).into()) + } + } + } + } + + #[instrument(skip(self, settings))] + async fn list_objects<'a>( + &'a self, + settings: &Settings, + prefix: &str, + ) -> StorageResult>>> { + let prefix = format!("{}/{}", self.prefix, prefix).replace("//", "/"); + let stream = self + .get_client(settings) + .await + .list_objects_v2() + .bucket(self.bucket.clone()) + .prefix(prefix) + .into_paginator() + .send() + .into_stream_03x() + .map_err(Box::new) + .try_filter_map(|page| { + let contents = page.contents.map(|cont| stream::iter(cont).map(Ok)); + ready(Ok(contents)) + }) + .try_flatten() + .try_filter_map(|object| async move { + let info = object_to_list_info(&object); + if info.is_none() { + tracing::error!(object=?object, "Found bad object while listing"); + } + Ok(info) + }); + Ok(stream.boxed()) + } + + #[instrument(skip(self, batch))] + async fn delete_batch( + &self, + settings: &Settings, + prefix: &str, + batch: Vec<(String, u64)>, + ) -> StorageResult { + let mut sizes = HashMap::new(); + let mut ids = Vec::new(); + for (id, size) in batch.into_iter() { + if let Ok(key) = self.get_path_str(prefix, id.as_str()) { + if let Ok(ident) = ObjectIdentifier::builder().key(key.clone()).build() { + ids.push(ident); + sizes.insert(key, size); + } + } + } + + let delete = Delete::builder() + .set_objects(Some(ids)) + .build() + .map_err(|e| StorageErrorKind::Other(e.to_string()))?; + + let res = self + .get_client(settings) + .await + .delete_objects() + .bucket(self.bucket.clone()) + .delete(delete) + .send() + .await + .map_err(Box::new)?; + + if let Some(err) = res.errors.as_ref().and_then(|e| e.first()) { + tracing::error!( + error = 
?err, + "Errors deleting objects", + ); + } + + let mut result = DeleteObjectsResult::default(); + for deleted in res.deleted() { + if let Some(key) = deleted.key() { + let size = sizes.get(key).unwrap_or(&0); + result.deleted_bytes += *size; + result.deleted_objects += 1; + } else { + tracing::error!("Deleted object without key"); + } + } + Ok(result) + } + + #[instrument(skip(self, settings))] + async fn get_snapshot_last_modified( + &self, + settings: &Settings, + snapshot: &SnapshotId, + ) -> StorageResult> { + let key = self.get_snapshot_path(snapshot)?; + let res = self + .get_client(settings) + .await + .head_object() + .bucket(self.bucket.clone()) + .key(key) + .send() + .await + .map_err(Box::new)?; + + let res = res.last_modified.ok_or(StorageErrorKind::Other( + "Object has no last_modified field".to_string(), + ))?; + let res = res.to_chrono_utc().map_err(|_| { + StorageErrorKind::Other("Invalid metadata timestamp".to_string()) + })?; + + Ok(res) + } + + #[instrument(skip(self))] + async fn get_object_range_buf( + &self, + settings: &Settings, + key: &str, + range: &Range, + ) -> StorageResult> { + let b = self + .get_client(settings) + .await + .get_object() + .bucket(self.bucket.clone()) + .key(key) + .range(range_to_header(range)); + Ok(Box::new( + b.send().await.map_err(Box::new)?.body.collect().await.map_err(Box::new)?, + )) + } + + #[instrument(skip(self))] + async fn get_object_range_read( + &self, + settings: &Settings, + key: &str, + range: &Range, + ) -> StorageResult> { + let client = self.get_client(settings).await; + let bucket = self.bucket.clone(); + Ok(Box::new(get_object_range(client.as_ref(), bucket, key, range).await?)) + } +} + +fn object_to_list_info(object: &Object) -> Option> { + let key = object.key()?; + let last_modified = object.last_modified()?; + let created_at = last_modified.to_chrono_utc().ok()?; + let id = Path::new(key).file_name().and_then(|s| s.to_str())?.to_string(); + let size_bytes = object.size.unwrap_or(0) as u64; 
+ Some(ListInfo { id, created_at, size_bytes }) +} + +#[derive(Debug)] +struct ProvideRefreshableCredentials(Arc); + +impl ProvideCredentials for ProvideRefreshableCredentials { + fn provide_credentials<'a>( + &'a self, + ) -> aws_credential_types::provider::future::ProvideCredentials<'a> + where + Self: 'a, + { + aws_credential_types::provider::future::ProvideCredentials::new(self.provide()) + } +} + +impl ProvideRefreshableCredentials { + async fn provide( + &self, + ) -> Result { + let creds = self + .0 + .get() + .await + .inspect_err(|err| error!(error = err, "Cannot load credentials")) + .map_err(CredentialsError::not_loaded)?; + let creds = aws_credential_types::Credentials::new( + creds.access_key_id, + creds.secret_access_key, + creds.session_token, + creds.expires_after.map(|e| e.into()), + "user", + ); + Ok(creds) + } +} + +async fn get_object_range( + client: &Client, + bucket: String, + key: &str, + range: &Range, +) -> StorageResult> { + let b = client.get_object().bucket(bucket).key(key).range(range_to_header(range)); + Ok(b.send().await.map_err(Box::new)?.body.into_async_read()) +} + +fn strip_quotes(s: &str) -> &str { + s.strip_prefix('"').and_then(|s| s.strip_suffix('"')).unwrap_or(s) +} + +#[cfg(test)] +#[allow(clippy::unwrap_used)] +mod tests { + use icechunk_macros::tokio_test; + + use crate::config::{S3Credentials, S3Options, S3StaticCredentials}; + + use super::*; + + #[tokio_test] + async fn test_serialize_s3_storage() { + let config = S3Options { + region: Some("us-west-2".to_string()), + endpoint_url: Some("http://localhost:9000".to_string()), + allow_http: true, + anonymous: false, + force_path_style: false, + }; + let credentials = S3Credentials::Static(S3StaticCredentials { + access_key_id: "access_key_id".to_string(), + secret_access_key: "secret_access_key".to_string(), + session_token: Some("session_token".to_string()), + expires_after: None, + }); + let storage = S3Storage::new( + config, + "bucket".to_string(), + 
Some("prefix".to_string()), + credentials, + true, + Vec::new(), + Vec::new(), + ) + .unwrap(); + + let serialized = serde_json::to_string(&storage).unwrap(); + + assert_eq!( + serialized, + r#"{"config":{"region":"us-west-2","endpoint_url":"http://localhost:9000","anonymous":false,"allow_http":true,"force_path_style":false},"credentials":{"s3_credential_type":"static","access_key_id":"access_key_id","secret_access_key":"secret_access_key","session_token":"session_token","expires_after":null},"bucket":"bucket","prefix":"prefix","can_write":true,"extra_read_headers":[],"extra_write_headers":[]}"# + ); + + let deserialized: S3Storage = serde_json::from_str(&serialized).unwrap(); + assert_eq!(storage.config, deserialized.config); + } + + #[tokio_test] + async fn test_s3_paths() { + let storage = S3Storage::new( + S3Options { + region: Some("us-west-2".to_string()), + endpoint_url: None, + allow_http: true, + anonymous: false, + force_path_style: false, + }, + "bucket".to_string(), + Some("prefix".to_string()), + S3Credentials::FromEnv, + true, + Vec::new(), + Vec::new(), + ) + .unwrap(); + + let ref_path = storage.ref_key("ref_key").unwrap(); + assert_eq!(ref_path, "prefix/refs/ref_key"); + + let snapshot_id = SnapshotId::random(); + let snapshot_path = storage.get_snapshot_path(&snapshot_id).unwrap(); + assert_eq!(snapshot_path, format!("prefix/snapshots/{snapshot_id}")); + + let manifest_id = ManifestId::random(); + let manifest_path = storage.get_manifest_path(&manifest_id).unwrap(); + assert_eq!(manifest_path, format!("prefix/manifests/{manifest_id}")); + + let chunk_id = ChunkId::random(); + let chunk_path = storage.get_chunk_path(&chunk_id).unwrap(); + assert_eq!(chunk_path, format!("prefix/chunks/{chunk_id}")); + + let transaction_id = SnapshotId::random(); + let transaction_path = storage.get_transaction_path(&transaction_id).unwrap(); + assert_eq!(transaction_path, format!("prefix/transactions/{transaction_id}")); + } +} diff --git 
a/icechunk/src/storage/mod.rs b/icechunk/src/storage/mod.rs index 1c82625c3..64e69014a 100644 --- a/icechunk/src/storage/mod.rs +++ b/icechunk/src/storage/mod.rs @@ -1,756 +1,35 @@ -use ::object_store::{azure::AzureConfigKey, gcp::GoogleConfigKey}; -use aws_sdk_s3::{ - config::http::HttpResponse, - error::SdkError, - operation::{ - complete_multipart_upload::CompleteMultipartUploadError, - create_multipart_upload::CreateMultipartUploadError, - delete_objects::DeleteObjectsError, get_object::GetObjectError, - head_object::HeadObjectError, list_objects_v2::ListObjectsV2Error, - put_object::PutObjectError, upload_part::UploadPartError, - }, - primitives::ByteStreamError, -}; -use chrono::{DateTime, Utc}; -use core::fmt; -use futures::{ - Stream, StreamExt, TryStreamExt, - stream::{BoxStream, FuturesOrdered}, -}; +use futures::{Stream, StreamExt, TryStreamExt, stream::BoxStream}; use itertools::Itertools; -use s3::S3Storage; -use serde::{Deserialize, Serialize}; use std::{ cmp::{max, min}, - collections::HashMap, - ffi::OsString, - io::Read, iter, - num::{NonZeroU16, NonZeroU64}, ops::Range, - path::Path, - sync::{Arc, Mutex, OnceLock}, -}; -use tokio::io::AsyncRead; -use tokio_util::io::SyncIoBridge; -use tracing::{instrument, warn}; - -use async_trait::async_trait; -use bytes::{Buf, Bytes}; -use thiserror::Error; - -#[cfg(test)] -pub mod logging; - -pub mod object_store; -pub mod s3; - -pub use object_store::ObjectStorage; - -use crate::{ - config::{AzureCredentials, GcsCredentials, S3Credentials, S3Options}, - error::ICError, - format::{ChunkId, ChunkOffset, ManifestId, SnapshotId}, - private, + sync::Arc, }; -#[derive(Debug, Error)] -pub enum StorageErrorKind { - #[error("object store error {0}")] - ObjectStore(#[from] Box<::object_store::Error>), - #[error("bad object store prefix {0:?}")] - BadPrefix(OsString), - #[error("error getting object from object store {0}")] - S3GetObjectError(#[from] Box>), - #[error("error writing object to object store {0}")] - 
S3PutObjectError(#[from] Box>), - #[error("error creating multipart upload {0}")] - S3CreateMultipartUploadError( - #[from] Box>, - ), - #[error("error uploading multipart part {0}")] - S3UploadPartError(#[from] Box>), - #[error("error completing multipart upload {0}")] - S3CompleteMultipartUploadError( - #[from] Box>, - ), - #[error("error getting object metadata from object store {0}")] - S3HeadObjectError(#[from] Box>), - #[error("error listing objects in object store {0}")] - S3ListObjectError(#[from] Box>), - #[error("error deleting objects in object store {0}")] - S3DeleteObjectError(#[from] Box>), - #[error("error streaming bytes from object store {0}")] - S3StreamError(#[from] Box), - #[error("I/O error: {0}")] - IOError(#[from] std::io::Error), - #[error("storage configuration error: {0}")] - R2ConfigurationError(String), - #[error("storage error: {0}")] - Other(String), -} -pub type StorageError = ICError; - -// it would be great to define this impl in error.rs, but it conflicts with the blanket -// `impl From for T` -impl From for StorageError -where - E: Into, -{ - fn from(value: E) -> Self { - Self::new(value.into()) - } -} - -pub type StorageResult = Result; - -#[derive(Debug)] -pub struct ListInfo { - pub id: Id, - pub created_at: DateTime, - pub size_bytes: u64, -} - -const SNAPSHOT_PREFIX: &str = "snapshots/"; -const MANIFEST_PREFIX: &str = "manifests/"; -// const ATTRIBUTES_PREFIX: &str = "attributes/"; -const CHUNK_PREFIX: &str = "chunks/"; -const REF_PREFIX: &str = "refs"; -const TRANSACTION_PREFIX: &str = "transactions/"; -const CONFIG_PATH: &str = "config.yaml"; - -#[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Clone, Hash, PartialOrd, Ord)] -pub struct ETag(pub String); -#[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Clone, Default)] -pub struct Generation(pub String); - -#[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Clone)] -pub struct VersionInfo { - pub etag: Option, - pub generation: Option, -} - -impl VersionInfo 
{ - pub fn for_creation() -> Self { - Self { etag: None, generation: None } - } - - pub fn from_etag_only(etag: String) -> Self { - Self { etag: Some(ETag(etag)), generation: None } - } - - pub fn is_create(&self) -> bool { - self.etag.is_none() && self.generation.is_none() - } - - pub fn etag(&self) -> Option<&String> { - self.etag.as_ref().map(|e| &e.0) - } - - pub fn generation(&self) -> Option<&String> { - self.generation.as_ref().map(|e| &e.0) - } -} - -#[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Clone, Default)] -pub struct RetriesSettings { - pub max_tries: Option, - pub initial_backoff_ms: Option, - pub max_backoff_ms: Option, -} - -impl RetriesSettings { - pub fn max_tries(&self) -> NonZeroU16 { - self.max_tries.unwrap_or_else(|| NonZeroU16::new(10).unwrap_or(NonZeroU16::MIN)) - } - - pub fn initial_backoff_ms(&self) -> u32 { - self.initial_backoff_ms.unwrap_or(100) - } - - pub fn max_backoff_ms(&self) -> u32 { - self.max_backoff_ms.unwrap_or(3 * 60 * 1000) - } - - pub fn merge(&self, other: Self) -> Self { - Self { - max_tries: other.max_tries.or(self.max_tries), - initial_backoff_ms: other.initial_backoff_ms.or(self.initial_backoff_ms), - max_backoff_ms: other.max_backoff_ms.or(self.max_backoff_ms), - } - } -} - -#[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Clone, Default)] -pub struct ConcurrencySettings { - pub max_concurrent_requests_for_object: Option, - pub ideal_concurrent_request_size: Option, -} - -impl ConcurrencySettings { - // AWS recommendations: https://docs.aws.amazon.com/whitepapers/latest/s3-optimizing-performance-best-practices/horizontal-scaling-and-request-parallelization-for-high-throughput.html - // 8-16 MB requests - // 85-90 MB/s per request - // these numbers would saturate a 12.5 Gbps network - - pub fn max_concurrent_requests_for_object(&self) -> NonZeroU16 { - self.max_concurrent_requests_for_object - .unwrap_or_else(|| NonZeroU16::new(18).unwrap_or(NonZeroU16::MIN)) - } - pub fn 
ideal_concurrent_request_size(&self) -> NonZeroU64 { - self.ideal_concurrent_request_size.unwrap_or_else(|| { - NonZeroU64::new(12 * 1024 * 1024).unwrap_or(NonZeroU64::MIN) - }) - } - - pub fn merge(&self, other: Self) -> Self { - Self { - max_concurrent_requests_for_object: other - .max_concurrent_requests_for_object - .or(self.max_concurrent_requests_for_object), - ideal_concurrent_request_size: other - .ideal_concurrent_request_size - .or(self.ideal_concurrent_request_size), - } - } -} - -#[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Clone, Default)] -pub struct Settings { - pub concurrency: Option, - pub retries: Option, - pub unsafe_use_conditional_update: Option, - pub unsafe_use_conditional_create: Option, - pub unsafe_use_metadata: Option, - #[serde(default)] - pub storage_class: Option, - #[serde(default)] - pub metadata_storage_class: Option, - #[serde(default)] - pub chunks_storage_class: Option, - #[serde(default)] - pub minimum_size_for_multipart_upload: Option, -} - -static DEFAULT_CONCURRENCY: OnceLock = OnceLock::new(); -static DEFAULT_RETRIES: OnceLock = OnceLock::new(); - -impl Settings { - pub fn concurrency(&self) -> &ConcurrencySettings { - self.concurrency - .as_ref() - .unwrap_or_else(|| DEFAULT_CONCURRENCY.get_or_init(Default::default)) - } - - pub fn retries(&self) -> &RetriesSettings { - self.retries - .as_ref() - .unwrap_or_else(|| DEFAULT_RETRIES.get_or_init(Default::default)) - } - - pub fn unsafe_use_conditional_create(&self) -> bool { - self.unsafe_use_conditional_create.unwrap_or(true) - } - - pub fn unsafe_use_conditional_update(&self) -> bool { - self.unsafe_use_conditional_update.unwrap_or(true) - } - - pub fn unsafe_use_metadata(&self) -> bool { - self.unsafe_use_metadata.unwrap_or(true) - } - - pub fn metadata_storage_class(&self) -> Option<&String> { - self.metadata_storage_class.as_ref().or(self.storage_class.as_ref()) - } - - pub fn chunks_storage_class(&self) -> Option<&String> { - 
self.chunks_storage_class.as_ref().or(self.storage_class.as_ref()) - } - - pub fn minimum_size_for_multipart_upload(&self) -> u64 { - // per AWS recommendation: 100 MB - self.minimum_size_for_multipart_upload.unwrap_or(100 * 1024 * 1024) - } - - pub fn merge(&self, other: Self) -> Self { - Self { - concurrency: match (&self.concurrency, other.concurrency) { - (None, None) => None, - (None, Some(c)) => Some(c), - (Some(c), None) => Some(c.clone()), - (Some(mine), Some(theirs)) => Some(mine.merge(theirs)), - }, - retries: match (&self.retries, other.retries) { - (None, None) => None, - (None, Some(c)) => Some(c), - (Some(c), None) => Some(c.clone()), - (Some(mine), Some(theirs)) => Some(mine.merge(theirs)), - }, - unsafe_use_conditional_create: match ( - &self.unsafe_use_conditional_create, - other.unsafe_use_conditional_create, - ) { - (None, None) => None, - (None, Some(c)) => Some(c), - (Some(c), None) => Some(*c), - (Some(_), Some(theirs)) => Some(theirs), - }, - unsafe_use_conditional_update: match ( - &self.unsafe_use_conditional_update, - other.unsafe_use_conditional_update, - ) { - (None, None) => None, - (None, Some(c)) => Some(c), - (Some(c), None) => Some(*c), - (Some(_), Some(theirs)) => Some(theirs), - }, - unsafe_use_metadata: match ( - &self.unsafe_use_metadata, - other.unsafe_use_metadata, - ) { - (None, None) => None, - (None, Some(c)) => Some(c), - (Some(c), None) => Some(*c), - (Some(_), Some(theirs)) => Some(theirs), - }, - storage_class: match (&self.storage_class, other.storage_class) { - (None, None) => None, - (None, Some(c)) => Some(c), - (Some(c), None) => Some(c.clone()), - (Some(_), Some(theirs)) => Some(theirs), - }, - metadata_storage_class: match ( - &self.metadata_storage_class, - other.metadata_storage_class, - ) { - (None, None) => None, - (None, Some(c)) => Some(c), - (Some(c), None) => Some(c.clone()), - (Some(_), Some(theirs)) => Some(theirs), - }, - chunks_storage_class: match ( - &self.chunks_storage_class, - 
other.chunks_storage_class, - ) { - (None, None) => None, - (None, Some(c)) => Some(c), - (Some(c), None) => Some(c.clone()), - (Some(_), Some(theirs)) => Some(theirs), - }, - minimum_size_for_multipart_upload: match ( - &self.minimum_size_for_multipart_upload, - other.minimum_size_for_multipart_upload, - ) { - (None, None) => None, - (None, Some(c)) => Some(c), - (Some(c), None) => Some(*c), - (Some(_), Some(theirs)) => Some(theirs), - }, - } - } -} - -pub enum Reader { - Asynchronous(Box), - Synchronous(Box), -} - -impl Reader { - pub async fn to_bytes(self, expected_size: usize) -> StorageResult { - match self { - Reader::Asynchronous(mut read) => { - // add some extra space to the buffer to optimize conversion to bytes - let mut buffer = Vec::with_capacity(expected_size + 16); - tokio::io::copy(&mut read, &mut buffer) - .await - .map_err(StorageErrorKind::IOError)?; - Ok(buffer.into()) - } - Reader::Synchronous(mut buf) => Ok(buf.copy_to_bytes(buf.remaining())), - } - } - - /// Notice this Read can only be used in non async contexts, for example, calling tokio::task::spawn_blocking - pub fn into_read(self) -> Box { - match self { - Reader::Asynchronous(read) => Box::new(SyncIoBridge::new(read)), - Reader::Synchronous(buf) => Box::new(buf.reader()), - } - } -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum FetchConfigResult { - Found { bytes: Bytes, version: VersionInfo }, - NotFound, -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum UpdateConfigResult { - Updated { new_version: VersionInfo }, - NotOnLatestVersion, -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum GetRefResult { - Found { bytes: Bytes, version: VersionInfo }, - NotFound, -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub enum WriteRefResult { - Written, - WontOverwrite, -} - -#[derive(Debug, Clone, PartialEq, Eq, Default)] -pub struct DeleteObjectsResult { - pub deleted_objects: u64, - pub deleted_bytes: u64, -} - -impl DeleteObjectsResult { - pub fn merge(&mut self, other: &Self) 
{ - self.deleted_objects += other.deleted_objects; - self.deleted_bytes += other.deleted_bytes; - } -} - -/// Fetch and write the parquet files that represent the repository in object store -/// -/// Different implementation can cache the files differently, or not at all. -/// Implementations are free to assume files are never overwritten. -#[async_trait] -#[typetag::serde(tag = "type")] -pub trait Storage: fmt::Debug + fmt::Display + private::Sealed + Sync + Send { - fn default_settings(&self) -> Settings { - Default::default() - } - - fn can_write(&self) -> bool; - - async fn fetch_config(&self, settings: &Settings) - -> StorageResult; - async fn update_config( - &self, - settings: &Settings, - config: Bytes, - previous_version: &VersionInfo, - ) -> StorageResult; - async fn fetch_snapshot( - &self, - settings: &Settings, - id: &SnapshotId, - ) -> StorageResult>; - /// Returns whatever reader is more efficient. - /// - /// For example, if processed with multiple requests, it will return a synchronous `Buf` - /// instance pointing the different parts. 
If it was executed in a single request, it's more - /// efficient to return the network `AsyncRead` directly - async fn fetch_manifest_known_size( - &self, - settings: &Settings, - id: &ManifestId, - size: u64, - ) -> StorageResult; - async fn fetch_manifest_unknown_size( - &self, - settings: &Settings, - id: &ManifestId, - ) -> StorageResult>; - async fn fetch_chunk( - &self, - settings: &Settings, - id: &ChunkId, - range: &Range, - ) -> StorageResult; // FIXME: format flags - async fn fetch_transaction_log( - &self, - settings: &Settings, - id: &SnapshotId, - ) -> StorageResult>; - - async fn write_snapshot( - &self, - settings: &Settings, - id: SnapshotId, - metadata: Vec<(String, String)>, - bytes: Bytes, - ) -> StorageResult<()>; - async fn write_manifest( - &self, - settings: &Settings, - id: ManifestId, - metadata: Vec<(String, String)>, - bytes: Bytes, - ) -> StorageResult<()>; - async fn write_chunk( - &self, - settings: &Settings, - id: ChunkId, - bytes: Bytes, - ) -> StorageResult<()>; - async fn write_transaction_log( - &self, - settings: &Settings, - id: SnapshotId, - metadata: Vec<(String, String)>, - bytes: Bytes, - ) -> StorageResult<()>; - - async fn get_ref( - &self, - settings: &Settings, - ref_key: &str, - ) -> StorageResult; - async fn ref_names(&self, settings: &Settings) -> StorageResult>; - async fn write_ref( - &self, - settings: &Settings, - ref_key: &str, - bytes: Bytes, - previous_version: &VersionInfo, - ) -> StorageResult; +#[cfg(not(target_arch = "wasm32"))] +use crate::config::{AzureCredentials, GcsCredentials, S3Credentials, S3Options}; - async fn list_objects<'a>( - &'a self, - settings: &Settings, - prefix: &str, - ) -> StorageResult>>>; +// Core modules - always available +pub mod errors; +pub mod traits; +pub mod types; - async fn delete_batch( - &self, - settings: &Settings, - prefix: &str, - batch: Vec<(String, u64)>, - ) -> StorageResult; +// Implementation modules - conditionally compiled +pub mod implementations; - /// 
Delete a stream of objects, by their id string representations - /// Input stream includes sizes to get as result the total number of bytes deleted - #[instrument(skip(self, settings, ids))] - async fn delete_objects( - &self, - settings: &Settings, - prefix: &str, - ids: BoxStream<'_, (String, u64)>, - ) -> StorageResult { - let res = Arc::new(Mutex::new(DeleteObjectsResult::default())); - ids.chunks(1_000) - // FIXME: configurable concurrency - .for_each_concurrent(10, |batch| { - let res = Arc::clone(&res); - async move { - let new_deletes = self - .delete_batch(settings, prefix, batch) - .await - .unwrap_or_else(|_| { - // FIXME: handle error instead of skipping - warn!("ignoring error in Storage::delete_batch"); - Default::default() - }); - #[allow(clippy::expect_used)] - res.lock().expect("Bug in delete objects").merge(&new_deletes); - } - }) - .await; - #[allow(clippy::expect_used)] - let res = res.lock().expect("Bug in delete objects"); - Ok(res.clone()) - } - - async fn get_snapshot_last_modified( - &self, - settings: &Settings, - snapshot: &SnapshotId, - ) -> StorageResult>; - - async fn root_is_clean(&self) -> StorageResult { - match self.list_objects(&Settings::default(), "").await?.next().await { - None => Ok(true), - Some(Ok(_)) => Ok(false), - Some(Err(err)) => Err(err), - } - } - - async fn list_chunks( - &self, - settings: &Settings, - ) -> StorageResult>>> { - Ok(translate_list_infos(self.list_objects(settings, CHUNK_PREFIX).await?)) - } - - async fn list_manifests( - &self, - settings: &Settings, - ) -> StorageResult>>> { - Ok(translate_list_infos(self.list_objects(settings, MANIFEST_PREFIX).await?)) - } - - async fn list_snapshots( - &self, - settings: &Settings, - ) -> StorageResult>>> { - Ok(translate_list_infos(self.list_objects(settings, SNAPSHOT_PREFIX).await?)) - } - - async fn list_transaction_logs( - &self, - settings: &Settings, - ) -> StorageResult>>> { - Ok(translate_list_infos(self.list_objects(settings, TRANSACTION_PREFIX).await?)) 
- } - - async fn delete_chunks( - &self, - settings: &Settings, - chunks: BoxStream<'_, (ChunkId, u64)>, - ) -> StorageResult { - self.delete_objects( - settings, - CHUNK_PREFIX, - chunks.map(|(id, size)| (id.to_string(), size)).boxed(), - ) - .await - } - - async fn delete_manifests( - &self, - settings: &Settings, - manifests: BoxStream<'_, (ManifestId, u64)>, - ) -> StorageResult { - self.delete_objects( - settings, - MANIFEST_PREFIX, - manifests.map(|(id, size)| (id.to_string(), size)).boxed(), - ) - .await - } - - async fn delete_snapshots( - &self, - settings: &Settings, - snapshots: BoxStream<'_, (SnapshotId, u64)>, - ) -> StorageResult { - self.delete_objects( - settings, - SNAPSHOT_PREFIX, - snapshots.map(|(id, size)| (id.to_string(), size)).boxed(), - ) - .await - } - - async fn delete_transaction_logs( - &self, - settings: &Settings, - transaction_logs: BoxStream<'_, (SnapshotId, u64)>, - ) -> StorageResult { - self.delete_objects( - settings, - TRANSACTION_PREFIX, - transaction_logs.map(|(id, size)| (id.to_string(), size)).boxed(), - ) - .await - } - - async fn delete_refs( - &self, - settings: &Settings, - refs: BoxStream<'_, String>, - ) -> StorageResult { - let refs = refs.map(|s| (s, 0)).boxed(); - Ok(self.delete_objects(settings, REF_PREFIX, refs).await?.deleted_objects) - } - - async fn get_object_range_buf( - &self, - settings: &Settings, - key: &str, - range: &Range, - ) -> StorageResult>; - - async fn get_object_range_read( - &self, - settings: &Settings, - key: &str, - range: &Range, - ) -> StorageResult>; - - async fn get_object_concurrently_multiple( - &self, - settings: &Settings, - key: &str, - parts: Vec>, - ) -> StorageResult> { - let results = - parts - .into_iter() - .map(|range| async move { - self.get_object_range_buf(settings, key, &range).await - }) - .collect::>(); - - let init: Box = Box::new(&[][..]); - let buf = results - .try_fold(init, |prev, buf| async { - let res: Box = Box::new(prev.chain(buf)); - Ok(res) - }) - .await?; 
+// Test module +#[cfg(test)] +pub mod logging; - Ok(Box::new(buf)) - } +// Re-export core types and traits +pub use errors::*; +pub use traits::Storage; +pub use types::*; - async fn get_object_concurrently( - &self, - settings: &Settings, - key: &str, - range: &Range, - ) -> StorageResult { - let parts = split_in_multiple_requests( - range, - settings.concurrency().ideal_concurrent_request_size().get(), - settings.concurrency().max_concurrent_requests_for_object().get(), - ) - .collect::>(); - - let res = match parts.len() { - 0 => Reader::Asynchronous(Box::new(tokio::io::empty())), - 1 => Reader::Asynchronous( - self.get_object_range_read(settings, key, range).await?, - ), - _ => Reader::Synchronous( - self.get_object_concurrently_multiple(settings, key, parts).await?, - ), - }; - Ok(res) - } -} - -fn convert_list_item(item: ListInfo) -> Option> -where - Id: for<'b> TryFrom<&'b str>, -{ - let id = Id::try_from(item.id.as_str()).ok()?; - let created_at = item.created_at; - Some(ListInfo { created_at, id, size_bytes: item.size_bytes }) -} - -fn translate_list_infos<'a, Id>( - s: impl Stream>> + Send + 'a, -) -> BoxStream<'a, StorageResult>> -where - Id: for<'b> TryFrom<&'b str> + Send + std::fmt::Debug + 'a, -{ - s.try_filter_map(|info| async move { - let info = convert_list_item(info); - if info.is_none() { - tracing::error!(list_info=?info, "Error processing list item metadata"); - } - Ok(info) - }) - .boxed() -} +// Conditional re-exports for implementations +#[cfg(not(target_arch = "wasm32"))] +pub use implementations::{ObjectStorage, S3Storage}; /// Split an object request into multiple byte range requests /// @@ -820,12 +99,41 @@ pub fn split_in_multiple_equal_requests( .map(|(_, range)| range) } +fn convert_list_item(item: ListInfo) -> Option> +where + Id: for<'b> TryFrom<&'b str>, +{ + let id = Id::try_from(item.id.as_str()).ok()?; + let created_at = item.created_at; + Some(ListInfo { created_at, id, size_bytes: item.size_bytes }) +} + +fn 
translate_list_infos<'a, Id>( + s: impl Stream>> + Send + 'a, +) -> BoxStream<'a, StorageResult>> +where + Id: for<'b> TryFrom<&'b str> + Send + std::fmt::Debug + 'a, +{ + s.try_filter_map(|info| async move { + let info = convert_list_item(info); + if info.is_none() { + tracing::error!(list_info=?info, "Error processing list item metadata"); + } + Ok(info) + }) + .boxed() +} + +// Constructor functions - conditionally compiled for non-WASM targets +#[cfg(not(target_arch = "wasm32"))] pub fn new_s3_storage( config: S3Options, bucket: String, prefix: Option, credentials: Option, ) -> StorageResult> { + use implementations::s3::S3Storage; + if let Some(endpoint) = &config.endpoint_url { if endpoint.contains("fly.storage.tigris.dev") { return Err(StorageError::from(StorageErrorKind::Other("Tigris Storage is not S3 compatible, use the Tigris specific constructor instead".to_string()))); @@ -844,6 +152,7 @@ pub fn new_s3_storage( Ok(Arc::new(st)) } +#[cfg(not(target_arch = "wasm32"))] pub fn new_r2_storage( config: S3Options, bucket: Option, @@ -851,6 +160,8 @@ pub fn new_r2_storage( account_id: Option, credentials: Option, ) -> StorageResult> { + use implementations::s3::S3Storage; + let (bucket, prefix) = match (bucket, prefix) { (Some(bucket), Some(prefix)) => (bucket, Some(prefix)), (None, Some(prefix)) => match prefix.split_once("/") { @@ -893,6 +204,7 @@ pub fn new_r2_storage( Ok(Arc::new(st)) } +#[cfg(not(target_arch = "wasm32"))] pub fn new_tigris_storage( config: S3Options, bucket: String, @@ -900,6 +212,8 @@ pub fn new_tigris_storage( credentials: Option, use_weak_consistency: bool, ) -> StorageResult> { + use implementations::s3::S3Storage; + let config = S3Options { endpoint_url: Some( config.endpoint_url.unwrap_or("https://fly.storage.tigris.dev".to_string()), @@ -934,18 +248,32 @@ pub fn new_tigris_storage( Ok(Arc::new(st)) } +// WASM-compatible constructors - always available pub async fn new_in_memory_storage() -> StorageResult> { - let st = 
ObjectStorage::new_in_memory().await?; - Ok(Arc::new(st)) + #[cfg(not(target_arch = "wasm32"))] + { + let st = implementations::ObjectStorage::new_in_memory().await?; + Ok(Arc::new(st)) + } + #[cfg(target_arch = "wasm32")] + { + // For WASM, we'll need a different in-memory implementation + // This is a placeholder that will be replaced by WASM-specific implementation + Err(StorageError::from(StorageErrorKind::Other( + "In-memory storage not yet implemented for WASM".to_string(), + ))) + } } +#[cfg(not(target_arch = "wasm32"))] pub async fn new_local_filesystem_storage( - path: &Path, + path: &std::path::Path, ) -> StorageResult> { - let st = ObjectStorage::new_local_filesystem(path).await?; + let st = implementations::ObjectStorage::new_local_filesystem(path).await?; Ok(Arc::new(st)) } +#[cfg(not(target_arch = "wasm32"))] pub async fn new_s3_object_store_storage( config: S3Options, bucket: String, @@ -958,34 +286,46 @@ pub async fn new_s3_object_store_storage( } } let storage = - ObjectStorage::new_s3(bucket, prefix, credentials, Some(config)).await?; + implementations::ObjectStorage::new_s3(bucket, prefix, credentials, Some(config)) + .await?; Ok(Arc::new(storage)) } +#[cfg(not(target_arch = "wasm32"))] pub async fn new_azure_blob_storage( account: String, container: String, prefix: Option, credentials: Option, - config: Option>, + config: Option>, ) -> StorageResult> { + use object_store::azure::AzureConfigKey; + let config = config .unwrap_or_default() .into_iter() .filter_map(|(key, value)| key.parse::().map(|k| (k, value)).ok()) .collect(); - let storage = - ObjectStorage::new_azure(account, container, prefix, credentials, Some(config)) - .await?; + let storage = implementations::ObjectStorage::new_azure( + account, + container, + prefix, + credentials, + Some(config), + ) + .await?; Ok(Arc::new(storage)) } +#[cfg(not(target_arch = "wasm32"))] pub async fn new_gcs_storage( bucket: String, prefix: Option, credentials: Option, - config: Option>, + config: 
Option>, ) -> StorageResult> { + use object_store::gcp::GoogleConfigKey; + let config = config .unwrap_or_default() .into_iter() @@ -993,15 +333,19 @@ pub async fn new_gcs_storage( key.parse::().map(|k| (k, value)).ok() }) .collect(); - let storage = - ObjectStorage::new_gcs(bucket, prefix, credentials, Some(config)).await?; + let storage = implementations::ObjectStorage::new_gcs( + bucket, + prefix, + credentials, + Some(config), + ) + .await?; Ok(Arc::new(storage)) } #[cfg(test)] #[allow(clippy::unwrap_used, clippy::panic)] mod tests { - use std::{collections::HashSet, fs::File, io::Write, path::PathBuf}; use crate::config::{GcsBearerCredential, GcsStaticCredentials}; @@ -1011,6 +355,7 @@ mod tests { use proptest::prelude::*; use tempfile::TempDir; + #[cfg(not(target_arch = "wasm32"))] #[tokio_test] async fn test_is_clean() { let repo_dir = TempDir::new().unwrap(); @@ -1027,6 +372,7 @@ mod tests { assert!(s.root_is_clean().await.unwrap()); } + #[cfg(not(target_arch = "wasm32"))] #[tokio_test] /// Regression test: we can deserialize a GCS credential with token async fn test_gcs_session_serialization() { diff --git a/icechunk/src/storage/traits.rs b/icechunk/src/storage/traits.rs new file mode 100644 index 000000000..684e448f4 --- /dev/null +++ b/icechunk/src/storage/traits.rs @@ -0,0 +1,352 @@ +use async_trait::async_trait; +use bytes::Bytes; +use chrono::{DateTime, Utc}; +use core::fmt; +use futures::{StreamExt, stream::BoxStream}; +use std::ops::Range; +use tokio::io::AsyncRead; + +use crate::{ + format::{ChunkId, ChunkOffset, ManifestId, SnapshotId}, + private, +}; + +use super::{ + DeleteObjectsResult, FetchConfigResult, GetRefResult, ListInfo, Reader, Settings, + StorageResult, UpdateConfigResult, VersionInfo, WriteRefResult, +}; + +/// Fetch and write the parquet files that represent the repository in object store +/// +/// Different implementation can cache the files differently, or not at all. 
+/// Implementations are free to assume files are never overwritten. +#[async_trait] +#[typetag::serde(tag = "type")] +pub trait Storage: fmt::Debug + fmt::Display + private::Sealed + Sync + Send { + fn default_settings(&self) -> Settings { + Default::default() + } + + fn can_write(&self) -> bool; + + async fn fetch_config(&self, settings: &Settings) + -> StorageResult; + async fn update_config( + &self, + settings: &Settings, + config: Bytes, + previous_version: &VersionInfo, + ) -> StorageResult; + async fn fetch_snapshot( + &self, + settings: &Settings, + id: &SnapshotId, + ) -> StorageResult>; + /// Returns whatever reader is more efficient. + /// + /// For example, if processed with multiple requests, it will return a synchronous `Buf` + /// instance pointing the different parts. If it was executed in a single request, it's more + /// efficient to return the network `AsyncRead` directly + async fn fetch_manifest_known_size( + &self, + settings: &Settings, + id: &ManifestId, + size: u64, + ) -> StorageResult; + async fn fetch_manifest_unknown_size( + &self, + settings: &Settings, + id: &ManifestId, + ) -> StorageResult>; + async fn fetch_chunk( + &self, + settings: &Settings, + id: &ChunkId, + range: &Range, + ) -> StorageResult; // FIXME: format flags + async fn fetch_transaction_log( + &self, + settings: &Settings, + id: &SnapshotId, + ) -> StorageResult>; + + async fn write_snapshot( + &self, + settings: &Settings, + id: SnapshotId, + metadata: Vec<(String, String)>, + bytes: Bytes, + ) -> StorageResult<()>; + async fn write_manifest( + &self, + settings: &Settings, + id: ManifestId, + metadata: Vec<(String, String)>, + bytes: Bytes, + ) -> StorageResult<()>; + async fn write_chunk( + &self, + settings: &Settings, + id: ChunkId, + bytes: Bytes, + ) -> StorageResult<()>; + async fn write_transaction_log( + &self, + settings: &Settings, + id: SnapshotId, + metadata: Vec<(String, String)>, + bytes: Bytes, + ) -> StorageResult<()>; + + async fn get_ref( + &self, 
+ settings: &Settings, + ref_key: &str, + ) -> StorageResult; + async fn ref_names(&self, settings: &Settings) -> StorageResult>; + async fn write_ref( + &self, + settings: &Settings, + ref_key: &str, + bytes: Bytes, + previous_version: &VersionInfo, + ) -> StorageResult; + + async fn list_objects<'a>( + &'a self, + settings: &Settings, + prefix: &str, + ) -> StorageResult>>>; + + async fn delete_batch( + &self, + settings: &Settings, + prefix: &str, + batch: Vec<(String, u64)>, + ) -> StorageResult; + + /// Delete a stream of objects, by their id string representations + /// Input stream includes sizes to get as result the total number of bytes deleted + /// + /// The default implementation deletes in batches of 1,000 with bounded concurrency. + async fn delete_objects( + &self, + settings: &Settings, + prefix: &str, + ids: BoxStream<'_, (String, u64)>, + ) -> StorageResult { + use futures::StreamExt; + use std::sync::{Arc, Mutex}; + use tracing::warn; + + let res = Arc::new(Mutex::new(DeleteObjectsResult::default())); + ids.chunks(1_000) + // FIXME: configurable concurrency + .for_each_concurrent(10, |batch| { + let res = Arc::clone(&res); + async move { + let new_deletes = self + .delete_batch(settings, prefix, batch) + .await + .unwrap_or_else(|_| { + // FIXME: handle error instead of skipping + warn!("ignoring error in Storage::delete_batch"); + Default::default() + }); + #[allow(clippy::expect_used)] + res.lock().expect("Bug in delete objects").merge(&new_deletes); + } + }) + .await; + #[allow(clippy::expect_used)] + let res = res.lock().expect("Bug in delete objects"); + Ok(res.clone()) + } + + async fn get_snapshot_last_modified( + &self, + settings: &Settings, + snapshot: &SnapshotId, + ) -> StorageResult>; + + async fn root_is_clean(&self) -> StorageResult { + match self.list_objects(&Settings::default(), "").await?.next().await { + None => Ok(true), + Some(Ok(_)) => Ok(false), + Some(Err(err)) 
=> Err(err), + } + } + + async fn list_chunks( + &self, + settings: &Settings, + ) -> StorageResult>>> { + use super::{CHUNK_PREFIX, translate_list_infos}; + Ok(translate_list_infos(self.list_objects(settings, CHUNK_PREFIX).await?)) + } + + async fn list_manifests( + &self, + settings: &Settings, + ) -> StorageResult>>> { + use super::{MANIFEST_PREFIX, translate_list_infos}; + Ok(translate_list_infos(self.list_objects(settings, MANIFEST_PREFIX).await?)) + } + + async fn list_snapshots( + &self, + settings: &Settings, + ) -> StorageResult>>> { + use super::{SNAPSHOT_PREFIX, translate_list_infos}; + Ok(translate_list_infos(self.list_objects(settings, SNAPSHOT_PREFIX).await?)) + } + + async fn list_transaction_logs( + &self, + settings: &Settings, + ) -> StorageResult>>> { + use super::{TRANSACTION_PREFIX, translate_list_infos}; + Ok(translate_list_infos(self.list_objects(settings, TRANSACTION_PREFIX).await?)) + } + + async fn delete_chunks( + &self, + settings: &Settings, + chunks: BoxStream<'_, (ChunkId, u64)>, + ) -> StorageResult { + use super::CHUNK_PREFIX; + use futures::StreamExt; + self.delete_objects( + settings, + CHUNK_PREFIX, + chunks.map(|(id, size)| (id.to_string(), size)).boxed(), + ) + .await + } + + async fn delete_manifests( + &self, + settings: &Settings, + manifests: BoxStream<'_, (ManifestId, u64)>, + ) -> StorageResult { + use super::MANIFEST_PREFIX; + use futures::StreamExt; + self.delete_objects( + settings, + MANIFEST_PREFIX, + manifests.map(|(id, size)| (id.to_string(), size)).boxed(), + ) + .await + } + + async fn delete_snapshots( + &self, + settings: &Settings, + snapshots: BoxStream<'_, (SnapshotId, u64)>, + ) -> StorageResult { + use super::SNAPSHOT_PREFIX; + use futures::StreamExt; + self.delete_objects( + settings, + SNAPSHOT_PREFIX, + snapshots.map(|(id, size)| (id.to_string(), size)).boxed(), + ) + .await + } + + async fn delete_transaction_logs( + &self, + settings: &Settings, + transaction_logs: BoxStream<'_, (SnapshotId, u64)>, + 
) -> StorageResult { + use super::TRANSACTION_PREFIX; + use futures::StreamExt; + self.delete_objects( + settings, + TRANSACTION_PREFIX, + transaction_logs.map(|(id, size)| (id.to_string(), size)).boxed(), + ) + .await + } + + async fn delete_refs( + &self, + settings: &Settings, + refs: BoxStream<'_, String>, + ) -> StorageResult { + use super::REF_PREFIX; + use futures::StreamExt; + let refs = refs.map(|s| (s, 0)).boxed(); + Ok(self.delete_objects(settings, REF_PREFIX, refs).await?.deleted_objects) + } + + async fn get_object_range_buf( + &self, + settings: &Settings, + key: &str, + range: &Range, + ) -> StorageResult>; + + async fn get_object_range_read( + &self, + settings: &Settings, + key: &str, + range: &Range, + ) -> StorageResult>; + + async fn get_object_concurrently_multiple( + &self, + settings: &Settings, + key: &str, + parts: Vec>, + ) -> StorageResult> { + use bytes::Buf; + use futures::TryStreamExt; + use futures::stream::FuturesOrdered; + + let results = + parts + .into_iter() + .map(|range| async move { + self.get_object_range_buf(settings, key, &range).await + }) + .collect::>(); + + let init: Box = Box::new(&[][..]); + let buf = results + .try_fold(init, |prev, buf| async { + let res: Box = Box::new(prev.chain(buf)); + Ok(res) + }) + .await?; + + Ok(Box::new(buf)) + } + + async fn get_object_concurrently( + &self, + settings: &Settings, + key: &str, + range: &Range, + ) -> StorageResult { + use super::split_in_multiple_requests; + + let parts = split_in_multiple_requests( + range, + settings.concurrency().ideal_concurrent_request_size().get(), + settings.concurrency().max_concurrent_requests_for_object().get(), + ) + .collect::>(); + + let res = match parts.len() { + 0 => Reader::Asynchronous(Box::new(tokio::io::empty())), + 1 => Reader::Asynchronous( + self.get_object_range_read(settings, key, range).await?, + ), + _ => Reader::Synchronous( + self.get_object_concurrently_multiple(settings, key, parts).await?, + ), + }; + Ok(res) + } +} diff 
--git a/icechunk/src/storage/types.rs b/icechunk/src/storage/types.rs new file mode 100644 index 000000000..0584b462a --- /dev/null +++ b/icechunk/src/storage/types.rs @@ -0,0 +1,329 @@ +use bytes::{Buf, Bytes}; +use chrono::{DateTime, Utc}; +use serde::{Deserialize, Serialize}; +use std::{ + io::Read, + num::{NonZeroU16, NonZeroU64}, + sync::OnceLock, +}; +use tokio::io::AsyncRead; +#[cfg(not(target_arch = "wasm32"))] +use tokio_util::io::SyncIoBridge; + +use super::StorageResult; + +pub const SNAPSHOT_PREFIX: &str = "snapshots/"; +pub const MANIFEST_PREFIX: &str = "manifests/"; +pub const CHUNK_PREFIX: &str = "chunks/"; +pub const REF_PREFIX: &str = "refs"; +pub const TRANSACTION_PREFIX: &str = "transactions/"; +pub const CONFIG_PATH: &str = "config.yaml"; + +#[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Clone, Hash, PartialOrd, Ord)] +pub struct ETag(pub String); + +#[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Clone, Default)] +pub struct Generation(pub String); + +#[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Clone)] +pub struct VersionInfo { + pub etag: Option, + pub generation: Option, +} + +impl VersionInfo { + pub fn for_creation() -> Self { + Self { etag: None, generation: None } + } + + pub fn from_etag_only(etag: String) -> Self { + Self { etag: Some(ETag(etag)), generation: None } + } + + pub fn is_create(&self) -> bool { + self.etag.is_none() && self.generation.is_none() + } + + pub fn etag(&self) -> Option<&String> { + self.etag.as_ref().map(|e| &e.0) + } + + pub fn generation(&self) -> Option<&String> { + self.generation.as_ref().map(|e| &e.0) + } +} + +#[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Clone, Default)] +pub struct RetriesSettings { + pub max_tries: Option, + pub initial_backoff_ms: Option, + pub max_backoff_ms: Option, +} + +impl RetriesSettings { + pub fn max_tries(&self) -> NonZeroU16 { + self.max_tries.unwrap_or_else(|| NonZeroU16::new(10).unwrap_or(NonZeroU16::MIN)) + } + + pub fn 
initial_backoff_ms(&self) -> u32 { + self.initial_backoff_ms.unwrap_or(100) + } + + pub fn max_backoff_ms(&self) -> u32 { + self.max_backoff_ms.unwrap_or(3 * 60 * 1000) + } + + pub fn merge(&self, other: Self) -> Self { + Self { + max_tries: other.max_tries.or(self.max_tries), + initial_backoff_ms: other.initial_backoff_ms.or(self.initial_backoff_ms), + max_backoff_ms: other.max_backoff_ms.or(self.max_backoff_ms), + } + } +} + +#[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Clone, Default)] +pub struct ConcurrencySettings { + pub max_concurrent_requests_for_object: Option, + pub ideal_concurrent_request_size: Option, +} + +impl ConcurrencySettings { + // AWS recommendations: https://docs.aws.amazon.com/whitepapers/latest/s3-optimizing-performance-best-practices/horizontal-scaling-and-request-parallelization-for-high-throughput.html + // 8-16 MB requests + // 85-90 MB/s per request + // these numbers would saturate a 12.5 Gbps network + + pub fn max_concurrent_requests_for_object(&self) -> NonZeroU16 { + self.max_concurrent_requests_for_object + .unwrap_or_else(|| NonZeroU16::new(18).unwrap_or(NonZeroU16::MIN)) + } + pub fn ideal_concurrent_request_size(&self) -> NonZeroU64 { + self.ideal_concurrent_request_size.unwrap_or_else(|| { + NonZeroU64::new(12 * 1024 * 1024).unwrap_or(NonZeroU64::MIN) + }) + } + + pub fn merge(&self, other: Self) -> Self { + Self { + max_concurrent_requests_for_object: other + .max_concurrent_requests_for_object + .or(self.max_concurrent_requests_for_object), + ideal_concurrent_request_size: other + .ideal_concurrent_request_size + .or(self.ideal_concurrent_request_size), + } + } +} + +#[derive(Debug, PartialEq, Eq, Serialize, Deserialize, Clone, Default)] +pub struct Settings { + pub concurrency: Option, + pub retries: Option, + pub unsafe_use_conditional_update: Option, + pub unsafe_use_conditional_create: Option, + pub unsafe_use_metadata: Option, + #[serde(default)] + pub storage_class: Option, + #[serde(default)] + pub 
metadata_storage_class: Option, + #[serde(default)] + pub chunks_storage_class: Option, + #[serde(default)] + pub minimum_size_for_multipart_upload: Option, +} + +static DEFAULT_CONCURRENCY: OnceLock = OnceLock::new(); +static DEFAULT_RETRIES: OnceLock = OnceLock::new(); + +impl Settings { + pub fn concurrency(&self) -> &ConcurrencySettings { + self.concurrency + .as_ref() + .unwrap_or_else(|| DEFAULT_CONCURRENCY.get_or_init(Default::default)) + } + + pub fn retries(&self) -> &RetriesSettings { + self.retries + .as_ref() + .unwrap_or_else(|| DEFAULT_RETRIES.get_or_init(Default::default)) + } + + pub fn unsafe_use_conditional_create(&self) -> bool { + self.unsafe_use_conditional_create.unwrap_or(true) + } + + pub fn unsafe_use_conditional_update(&self) -> bool { + self.unsafe_use_conditional_update.unwrap_or(true) + } + + pub fn unsafe_use_metadata(&self) -> bool { + self.unsafe_use_metadata.unwrap_or(true) + } + + pub fn metadata_storage_class(&self) -> Option<&String> { + self.metadata_storage_class.as_ref().or(self.storage_class.as_ref()) + } + + pub fn chunks_storage_class(&self) -> Option<&String> { + self.chunks_storage_class.as_ref().or(self.storage_class.as_ref()) + } + + pub fn minimum_size_for_multipart_upload(&self) -> u64 { + // per AWS recommendation: 100 MB + self.minimum_size_for_multipart_upload.unwrap_or(100 * 1024 * 1024) + } + + pub fn merge(&self, other: Self) -> Self { + Self { + concurrency: match (&self.concurrency, other.concurrency) { + (None, None) => None, + (None, Some(c)) => Some(c), + (Some(c), None) => Some(c.clone()), + (Some(mine), Some(theirs)) => Some(mine.merge(theirs)), + }, + retries: match (&self.retries, other.retries) { + (None, None) => None, + (None, Some(c)) => Some(c), + (Some(c), None) => Some(c.clone()), + (Some(mine), Some(theirs)) => Some(mine.merge(theirs)), + }, + unsafe_use_conditional_create: match ( + &self.unsafe_use_conditional_create, + other.unsafe_use_conditional_create, + ) { + (None, None) => None, + 
(None, Some(c)) => Some(c), + (Some(c), None) => Some(*c), + (Some(_), Some(theirs)) => Some(theirs), + }, + unsafe_use_conditional_update: match ( + &self.unsafe_use_conditional_update, + other.unsafe_use_conditional_update, + ) { + (None, None) => None, + (None, Some(c)) => Some(c), + (Some(c), None) => Some(*c), + (Some(_), Some(theirs)) => Some(theirs), + }, + unsafe_use_metadata: match ( + &self.unsafe_use_metadata, + other.unsafe_use_metadata, + ) { + (None, None) => None, + (None, Some(c)) => Some(c), + (Some(c), None) => Some(*c), + (Some(_), Some(theirs)) => Some(theirs), + }, + storage_class: match (&self.storage_class, other.storage_class) { + (None, None) => None, + (None, Some(c)) => Some(c), + (Some(c), None) => Some(c.clone()), + (Some(_), Some(theirs)) => Some(theirs), + }, + metadata_storage_class: match ( + &self.metadata_storage_class, + other.metadata_storage_class, + ) { + (None, None) => None, + (None, Some(c)) => Some(c), + (Some(c), None) => Some(c.clone()), + (Some(_), Some(theirs)) => Some(theirs), + }, + chunks_storage_class: match ( + &self.chunks_storage_class, + other.chunks_storage_class, + ) { + (None, None) => None, + (None, Some(c)) => Some(c), + (Some(c), None) => Some(c.clone()), + (Some(_), Some(theirs)) => Some(theirs), + }, + minimum_size_for_multipart_upload: match ( + &self.minimum_size_for_multipart_upload, + other.minimum_size_for_multipart_upload, + ) { + (None, None) => None, + (None, Some(c)) => Some(c), + (Some(c), None) => Some(*c), + (Some(_), Some(theirs)) => Some(theirs), + }, + } + } +} + +pub enum Reader { + Asynchronous(Box), + Synchronous(Box), +} + +impl Reader { + pub async fn to_bytes(self, expected_size: usize) -> StorageResult { + match self { + Reader::Asynchronous(mut read) => { + // add some extra space to the buffer to optimize conversion to bytes + let mut buffer = Vec::with_capacity(expected_size + 16); + tokio::io::copy(&mut read, &mut buffer) + .await + .map_err(super::StorageErrorKind::IOError)?; 
+ Ok(buffer.into()) + } + Reader::Synchronous(mut buf) => Ok(buf.copy_to_bytes(buf.remaining())), + } + } + + /// Notice this Read can only be used in non async contexts, for example, calling tokio::task::spawn_blocking + pub fn into_read(self) -> Box { + match self { + #[cfg(not(target_arch = "wasm32"))] + Reader::Asynchronous(read) => Box::new(SyncIoBridge::new(read)), + #[cfg(target_arch = "wasm32")] + Reader::Asynchronous(_) => panic!("SyncIoBridge not available on WASM"), + Reader::Synchronous(buf) => Box::new(buf.reader()), + } + } +} + +#[derive(Debug)] +pub struct ListInfo { + pub id: Id, + pub created_at: DateTime, + pub size_bytes: u64, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum FetchConfigResult { + Found { bytes: Bytes, version: VersionInfo }, + NotFound, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum UpdateConfigResult { + Updated { new_version: VersionInfo }, + NotOnLatestVersion, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum GetRefResult { + Found { bytes: Bytes, version: VersionInfo }, + NotFound, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum WriteRefResult { + Written, + WontOverwrite, +} + +#[derive(Debug, Clone, PartialEq, Eq, Default)] +pub struct DeleteObjectsResult { + pub deleted_objects: u64, + pub deleted_bytes: u64, +} + +impl DeleteObjectsResult { + pub fn merge(&mut self, other: &Self) { + self.deleted_objects += other.deleted_objects; + self.deleted_bytes += other.deleted_bytes; + } +} diff --git a/icechunk/src/virtual_chunks.rs b/icechunk/src/virtual_chunks.rs index 2a463d79c..e6b716ce2 100644 --- a/icechunk/src/virtual_chunks.rs +++ b/icechunk/src/virtual_chunks.rs @@ -2,40 +2,49 @@ use std::{ collections::HashMap, num::{NonZeroU16, NonZeroU64}, ops::Range, - str::FromStr, sync::Arc, }; use async_trait::async_trait; -use aws_sdk_s3::{Client, error::SdkError, operation::get_object::GetObjectError}; use bytes::{Buf, Bytes}; -use futures::{TryStreamExt, stream::FuturesOrdered}; -use object_store::{ - 
ClientConfigKey, GetOptions, ObjectStore, gcp::GoogleConfigKey, - local::LocalFileSystem, path::Path, -}; use quick_cache::sync::Cache; use serde::{Deserialize, Serialize}; use url::Url; +#[cfg(not(target_arch = "wasm32"))] +use { + aws_sdk_s3::{Client, error::SdkError, operation::get_object::GetObjectError}, + futures::{TryStreamExt, stream::FuturesOrdered}, + object_store::{ + ClientConfigKey, GetOptions, ObjectStore, gcp::GoogleConfigKey, + local::LocalFileSystem, path::Path, + }, + std::str::FromStr, +}; + +#[cfg(target_arch = "wasm32")] +use futures::{TryStreamExt, stream::FuturesOrdered}; + use crate::{ ObjectStoreConfig, config::{Credentials, GcsCredentials, S3Credentials, S3Options}, format::{ ChunkOffset, - manifest::{ - Checksum, SecondsSinceEpoch, VirtualReferenceError, VirtualReferenceErrorKind, - }, + manifest::{Checksum, VirtualReferenceError, VirtualReferenceErrorKind}, }, private, - storage::{ - self, - object_store::{ - GcsObjectStoreBackend, HttpObjectStoreBackend, ObjectStoreBackend as _, - }, - s3::{mk_client, range_to_header}, - split_in_multiple_requests, + storage::{self, split_in_multiple_requests}, +}; + +#[cfg(not(target_arch = "wasm32"))] +use crate::format::manifest::SecondsSinceEpoch; + +#[cfg(not(target_arch = "wasm32"))] +use crate::storage::implementations::{ + object_store::{ + GcsObjectStoreBackend, HttpObjectStoreBackend, ObjectStoreBackend as _, }, + s3::{mk_client, range_to_header}, }; pub type ContainerName = String; @@ -256,7 +265,7 @@ impl VirtualChunkResolver { settings: storage::Settings, ) -> Self { fn add_trailing(s: String) -> String { - if s.ends_with('/') { s } else { format!("{s}/") } + if s.ends_with('/') { s } else { format!("{}/", s) } } // we need to validate the containers because they can come from persisted config @@ -487,12 +496,14 @@ fn fetcher_cache_key( } } +#[cfg(not(target_arch = "wasm32"))] #[derive(Debug)] pub struct S3Fetcher { client: Arc, settings: storage::Settings, } +#[cfg(not(target_arch = 
"wasm32"))] impl S3Fetcher { pub async fn new( opts: &S3Options, @@ -505,8 +516,10 @@ impl S3Fetcher { } } +#[cfg(not(target_arch = "wasm32"))] impl private::Sealed for S3Fetcher {} +#[cfg(not(target_arch = "wasm32"))] #[async_trait] impl ChunkFetcher for S3Fetcher { fn ideal_concurrent_request_size(&self) -> NonZeroU64 { @@ -600,13 +613,17 @@ impl ChunkFetcher for S3Fetcher { } } +#[cfg(not(target_arch = "wasm32"))] #[derive(Debug)] pub struct ObjectStoreFetcher { client: Arc, settings: storage::Settings, } + +#[cfg(not(target_arch = "wasm32"))] impl private::Sealed for ObjectStoreFetcher {} +#[cfg(not(target_arch = "wasm32"))] impl ObjectStoreFetcher { fn new_local() -> Self { ObjectStoreFetcher { @@ -669,6 +686,7 @@ impl ObjectStoreFetcher { } } +#[cfg(not(target_arch = "wasm32"))] #[async_trait] impl ChunkFetcher for ObjectStoreFetcher { fn ideal_concurrent_request_size(&self) -> NonZeroU64 { @@ -721,6 +739,117 @@ impl ChunkFetcher for ObjectStoreFetcher { } } +// WASM-compatible stubs +#[cfg(target_arch = "wasm32")] +#[derive(Debug)] +pub struct S3Fetcher; + +#[cfg(target_arch = "wasm32")] +impl S3Fetcher { + pub async fn new( + _opts: &crate::config::S3Options, + _credentials: &crate::config::S3Credentials, + _settings: storage::Settings, + ) -> Self { + Self + } +} + +#[cfg(target_arch = "wasm32")] +impl private::Sealed for S3Fetcher {} + +#[cfg(target_arch = "wasm32")] +#[async_trait] +impl ChunkFetcher for S3Fetcher { + fn ideal_concurrent_request_size(&self) -> NonZeroU64 { + NonZeroU64::new(1024).unwrap_or(NonZeroU64::MIN) + } + + fn max_concurrent_requests_for_object(&self) -> NonZeroU16 { + NonZeroU16::new(1).unwrap_or(NonZeroU16::MIN) + } + + async fn fetch_part( + &self, + _chunk_location: &Url, + _range: Range, + _checksum: Option<&Checksum>, + ) -> Result, VirtualReferenceError> { + Err(VirtualReferenceError::from(VirtualReferenceErrorKind::OtherError(Box::new( + std::io::Error::new( + std::io::ErrorKind::Unsupported, + "S3Fetcher not supported on 
WASM", + ), + )))) + } +} + +#[cfg(target_arch = "wasm32")] +#[derive(Debug)] +pub struct ObjectStoreFetcher; + +#[cfg(target_arch = "wasm32")] +impl ObjectStoreFetcher { + fn new_local() -> Self { + Self + } + + pub async fn new_http( + _url: &str, + _opts: &HashMap, + ) -> Result { + Err(VirtualReferenceError::from(VirtualReferenceErrorKind::OtherError(Box::new( + std::io::Error::new( + std::io::ErrorKind::Unsupported, + "ObjectStoreFetcher::new_http not supported on WASM", + ), + )))) + } + + pub async fn new_gcs( + _bucket: String, + _prefix: Option, + _credentials: Option, + _config: HashMap, + ) -> Result { + Err(VirtualReferenceError::from(VirtualReferenceErrorKind::OtherError(Box::new( + std::io::Error::new( + std::io::ErrorKind::Unsupported, + "ObjectStoreFetcher::new_gcs not supported on WASM", + ), + )))) + } +} + +#[cfg(target_arch = "wasm32")] +impl private::Sealed for ObjectStoreFetcher {} + +#[cfg(target_arch = "wasm32")] +#[async_trait] +impl ChunkFetcher for ObjectStoreFetcher { + fn ideal_concurrent_request_size(&self) -> NonZeroU64 { + NonZeroU64::new(1024).unwrap_or(NonZeroU64::MIN) + } + + fn max_concurrent_requests_for_object(&self) -> NonZeroU16 { + NonZeroU16::new(1).unwrap_or(NonZeroU16::MIN) + } + + async fn fetch_part( + &self, + _chunk_location: &Url, + _range: Range, + _checksum: Option<&Checksum>, + ) -> Result, VirtualReferenceError> { + Err(VirtualReferenceError::from(VirtualReferenceErrorKind::OtherError(Box::new( + std::io::Error::new( + std::io::ErrorKind::Unsupported, + "ObjectStoreFetcher not supported on WASM", + ), + )))) + } +} + #[cfg(test)] #[allow(clippy::panic, clippy::unwrap_used, clippy::expect_used)] mod tests { From 8f3c77fa62fe888818dafde92182fa42897dd477 Mon Sep 17 00:00:00 2001 From: Matthew Iannucci Date: Thu, 7 Aug 2025 15:18:05 -0400 Subject: [PATCH 2/2] More cleanup --- .../{implementations => backends}/mod.rs | 0 .../object_store.rs | 0 .../{implementations => backends}/s3.rs | 0 icechunk/src/storage/mod.rs | 
28 +- icechunk/src/storage/object_store.rs | 1297 ----------------- icechunk/src/storage/s3.rs | 1144 --------------- icechunk/src/virtual_chunks.rs | 2 +- icechunk/tests/test_storage.rs | 2 +- icechunk/tests/test_virtual_refs.rs | 3 +- 9 files changed, 16 insertions(+), 2460 deletions(-) rename icechunk/src/storage/{implementations => backends}/mod.rs (100%) rename icechunk/src/storage/{implementations => backends}/object_store.rs (100%) rename icechunk/src/storage/{implementations => backends}/s3.rs (100%) delete mode 100644 icechunk/src/storage/object_store.rs delete mode 100644 icechunk/src/storage/s3.rs diff --git a/icechunk/src/storage/implementations/mod.rs b/icechunk/src/storage/backends/mod.rs similarity index 100% rename from icechunk/src/storage/implementations/mod.rs rename to icechunk/src/storage/backends/mod.rs diff --git a/icechunk/src/storage/implementations/object_store.rs b/icechunk/src/storage/backends/object_store.rs similarity index 100% rename from icechunk/src/storage/implementations/object_store.rs rename to icechunk/src/storage/backends/object_store.rs diff --git a/icechunk/src/storage/implementations/s3.rs b/icechunk/src/storage/backends/s3.rs similarity index 100% rename from icechunk/src/storage/implementations/s3.rs rename to icechunk/src/storage/backends/s3.rs diff --git a/icechunk/src/storage/mod.rs b/icechunk/src/storage/mod.rs index 64e69014a..3d86e905f 100644 --- a/icechunk/src/storage/mod.rs +++ b/icechunk/src/storage/mod.rs @@ -16,7 +16,7 @@ pub mod traits; pub mod types; // Implementation modules - conditionally compiled -pub mod implementations; +pub mod backends; // Test module #[cfg(test)] @@ -29,7 +29,7 @@ pub use types::*; // Conditional re-exports for implementations #[cfg(not(target_arch = "wasm32"))] -pub use implementations::{ObjectStorage, S3Storage}; +pub use backends::{ObjectStorage, S3Storage}; /// Split an object request into multiple byte range requests /// @@ -132,7 +132,7 @@ pub fn new_s3_storage( prefix: 
Option, credentials: Option, ) -> StorageResult> { - use implementations::s3::S3Storage; + use backends::s3::S3Storage; if let Some(endpoint) = &config.endpoint_url { if endpoint.contains("fly.storage.tigris.dev") { @@ -160,7 +160,7 @@ pub fn new_r2_storage( account_id: Option, credentials: Option, ) -> StorageResult> { - use implementations::s3::S3Storage; + use backends::s3::S3Storage; let (bucket, prefix) = match (bucket, prefix) { (Some(bucket), Some(prefix)) => (bucket, Some(prefix)), @@ -212,7 +212,7 @@ pub fn new_tigris_storage( credentials: Option, use_weak_consistency: bool, ) -> StorageResult> { - use implementations::s3::S3Storage; + use backends::s3::S3Storage; let config = S3Options { endpoint_url: Some( @@ -252,7 +252,7 @@ pub fn new_tigris_storage( pub async fn new_in_memory_storage() -> StorageResult> { #[cfg(not(target_arch = "wasm32"))] { - let st = implementations::ObjectStorage::new_in_memory().await?; + let st = backends::ObjectStorage::new_in_memory().await?; Ok(Arc::new(st)) } #[cfg(target_arch = "wasm32")] @@ -269,7 +269,7 @@ pub async fn new_in_memory_storage() -> StorageResult> { pub async fn new_local_filesystem_storage( path: &std::path::Path, ) -> StorageResult> { - let st = implementations::ObjectStorage::new_local_filesystem(path).await?; + let st = backends::ObjectStorage::new_local_filesystem(path).await?; Ok(Arc::new(st)) } @@ -286,7 +286,7 @@ pub async fn new_s3_object_store_storage( } } let storage = - implementations::ObjectStorage::new_s3(bucket, prefix, credentials, Some(config)) + backends::ObjectStorage::new_s3(bucket, prefix, credentials, Some(config)) .await?; Ok(Arc::new(storage)) } @@ -306,7 +306,7 @@ pub async fn new_azure_blob_storage( .into_iter() .filter_map(|(key, value)| key.parse::().map(|k| (k, value)).ok()) .collect(); - let storage = implementations::ObjectStorage::new_azure( + let storage = backends::ObjectStorage::new_azure( account, container, prefix, @@ -333,13 +333,9 @@ pub async fn new_gcs_storage( 
key.parse::().map(|k| (k, value)).ok() }) .collect(); - let storage = implementations::ObjectStorage::new_gcs( - bucket, - prefix, - credentials, - Some(config), - ) - .await?; + let storage = + backends::ObjectStorage::new_gcs(bucket, prefix, credentials, Some(config)) + .await?; Ok(Arc::new(storage)) } diff --git a/icechunk/src/storage/object_store.rs b/icechunk/src/storage/object_store.rs deleted file mode 100644 index 7a5033979..000000000 --- a/icechunk/src/storage/object_store.rs +++ /dev/null @@ -1,1297 +0,0 @@ -use crate::{ - config::{ - AzureCredentials, AzureStaticCredentials, GcsBearerCredential, GcsCredentials, - GcsCredentialsFetcher, GcsStaticCredentials, S3Credentials, S3Options, - }, - format::{ChunkId, ChunkOffset, FileTypeTag, ManifestId, ObjectId, SnapshotId}, - private, -}; -use async_trait::async_trait; -use bytes::{Buf, Bytes}; -use chrono::{DateTime, TimeDelta, Utc}; -use futures::{ - StreamExt, TryStreamExt, - stream::{self, BoxStream}, -}; -use object_store::{ - Attribute, AttributeValue, Attributes, BackoffConfig, ClientConfigKey, - CredentialProvider, GetOptions, ObjectMeta, ObjectStore, PutMode, PutOptions, - PutPayload, RetryConfig, StaticCredentialProvider, UpdateVersion, - aws::AmazonS3Builder, - azure::{AzureConfigKey, MicrosoftAzureBuilder}, - gcp::{GcpCredential, GoogleCloudStorageBuilder, GoogleConfigKey}, - http::HttpBuilder, - local::LocalFileSystem, - memory::InMemory, - path::Path as ObjectPath, -}; -use serde::{Deserialize, Serialize}; -use std::{ - collections::HashMap, - fmt::{self, Debug, Display}, - fs::create_dir_all, - future::ready, - num::{NonZeroU16, NonZeroU64}, - ops::Range, - path::{Path as StdPath, PathBuf}, - sync::Arc, -}; -use tokio::{ - io::AsyncRead, - sync::{OnceCell, RwLock}, -}; -use tokio_util::compat::FuturesAsyncReadCompatExt; -use tracing::instrument; - -use super::{ - CHUNK_PREFIX, CONFIG_PATH, ConcurrencySettings, DeleteObjectsResult, ETag, - FetchConfigResult, Generation, GetRefResult, ListInfo, 
MANIFEST_PREFIX, REF_PREFIX, - Reader, RetriesSettings, SNAPSHOT_PREFIX, Settings, Storage, StorageError, - StorageErrorKind, StorageResult, TRANSACTION_PREFIX, UpdateConfigResult, VersionInfo, - WriteRefResult, -}; - -#[derive(Debug, Serialize, Deserialize)] -pub struct ObjectStorage { - backend: Arc, - #[serde(skip)] - /// We need to use OnceCell to allow async initialization, because serde - /// does not support async cfunction calls from deserialization. This gives - /// us a way to lazily initialize the client. - client: OnceCell>, -} - -impl ObjectStorage { - /// Create an in memory Storage implementation - /// - /// This implementation should not be used in production code. - pub async fn new_in_memory() -> Result { - let backend = Arc::new(InMemoryObjectStoreBackend); - let storage = ObjectStorage { backend, client: OnceCell::new() }; - Ok(storage) - } - - /// Create an local filesystem Storage implementation - /// - /// This implementation should not be used in production code. - pub async fn new_local_filesystem( - prefix: &StdPath, - ) -> Result { - tracing::warn!( - "The LocalFileSystem storage is not safe for concurrent commits. If more than one thread/process will attempt to commit at the same time, prefer using object stores." 
- ); - let backend = - Arc::new(LocalFileSystemObjectStoreBackend { path: prefix.to_path_buf() }); - let storage = ObjectStorage { backend, client: OnceCell::new() }; - Ok(storage) - } - - pub async fn new_s3( - bucket: String, - prefix: Option, - credentials: Option, - config: Option, - ) -> Result { - let backend = - Arc::new(S3ObjectStoreBackend { bucket, prefix, credentials, config }); - let storage = ObjectStorage { backend, client: OnceCell::new() }; - - Ok(storage) - } - - pub async fn new_azure( - account: String, - container: String, - prefix: Option, - credentials: Option, - config: Option>, - ) -> Result { - let backend = Arc::new(AzureObjectStoreBackend { - account, - container, - prefix, - credentials, - config, - }); - let storage = ObjectStorage { backend, client: OnceCell::new() }; - - Ok(storage) - } - - pub async fn new_gcs( - bucket: String, - prefix: Option, - credentials: Option, - config: Option>, - ) -> Result { - let backend = - Arc::new(GcsObjectStoreBackend { bucket, prefix, credentials, config }); - let storage = ObjectStorage { backend, client: OnceCell::new() }; - - Ok(storage) - } - - /// Get the client, initializing it if it hasn't been initialized yet. This is necessary because the - /// client is not serializeable and must be initialized after deserialization. Under normal construction - /// the original client is returned immediately. - #[instrument(skip_all)] - async fn get_client(&self, settings: &Settings) -> &Arc { - self.client - .get_or_init(|| async { - // TODO: handle error better? - #[allow(clippy::expect_used)] - self.backend - .mk_object_store(settings) - .expect("failed to create object store") - }) - .await - } - - /// We need this because object_store's local file implementation doesn't sort refs. Since this - /// implementation is used only for tests, it's OK to sort in memory. 
- pub fn artificially_sort_refs_in_mem(&self) -> bool { - self.backend.artificially_sort_refs_in_mem() - } - - /// Return all keys in the store - /// - /// Intended for testing and debugging purposes only. - pub async fn all_keys(&self) -> StorageResult> { - Ok(self - .get_client(&self.backend.default_settings()) - .await - .list(None) - .map_ok(|obj| obj.location.to_string()) - .try_collect() - .await - .map_err(Box::new)?) - } - - fn get_path_str(&self, file_prefix: &str, id: &str) -> ObjectPath { - let path = format!("{}/{}/{}", self.backend.prefix(), file_prefix, id); - ObjectPath::from(path) - } - - fn get_path( - &self, - file_prefix: &str, - id: &ObjectId, - ) -> ObjectPath { - // we serialize the url using crockford - self.get_path_str(file_prefix, id.to_string().as_str()) - } - - fn get_config_path(&self) -> ObjectPath { - self.get_path_str("", CONFIG_PATH) - } - - fn get_snapshot_path(&self, id: &SnapshotId) -> ObjectPath { - self.get_path(SNAPSHOT_PREFIX, id) - } - - fn get_manifest_path(&self, id: &ManifestId) -> ObjectPath { - self.get_path(MANIFEST_PREFIX, id) - } - - fn get_transaction_path(&self, id: &SnapshotId) -> ObjectPath { - self.get_path(TRANSACTION_PREFIX, id) - } - - fn get_chunk_path(&self, id: &ChunkId) -> ObjectPath { - self.get_path(CHUNK_PREFIX, id) - } - - fn drop_prefix(&self, prefix: &ObjectPath, path: &ObjectPath) -> Option { - path.prefix_match(&ObjectPath::from(format!("{prefix}"))).map(|it| it.collect()) - } - - fn ref_key(&self, ref_key: &str) -> ObjectPath { - // ObjectPath knows how to deal with empty path parts: bar//foo - ObjectPath::from(format!("{}/{}/{}", self.backend.prefix(), REF_PREFIX, ref_key)) - } - - async fn get_object_reader( - &self, - settings: &Settings, - path: &ObjectPath, - ) -> StorageResult> { - Ok(self - .get_client(settings) - .await - .get(path) - .await - .map_err(Box::new)? 
- .into_stream() - .err_into() - .into_async_read() - .compat()) - } - - fn metadata_to_attributes( - &self, - settings: &Settings, - metadata: Vec<(String, String)>, - ) -> Attributes { - if settings.unsafe_use_metadata() { - Attributes::from_iter(metadata.into_iter().map(|(key, val)| { - ( - Attribute::Metadata(std::borrow::Cow::Owned(key)), - AttributeValue::from(val), - ) - })) - } else { - Attributes::new() - } - } - - fn get_ref_name(&self, prefix: &ObjectPath, meta: &ObjectMeta) -> Option { - let relative_key = self.drop_prefix(prefix, &meta.location)?; - let parent = relative_key.parts().next()?; - Some(parent.as_ref().to_string()) - } - - fn get_put_mode( - &self, - settings: &Settings, - previous_version: &VersionInfo, - ) -> PutMode { - match ( - previous_version.is_create(), - settings.unsafe_use_conditional_create(), - settings.unsafe_use_conditional_update(), - ) { - (true, true, _) => PutMode::Create, - (true, false, _) => PutMode::Overwrite, - - (false, _, true) => PutMode::Update(UpdateVersion { - e_tag: previous_version.etag().cloned(), - version: previous_version.generation().cloned(), - }), - (false, _, false) => PutMode::Overwrite, - } - } -} - -impl fmt::Display for ObjectStorage { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "ObjectStorage(backend={})", self.backend) - } -} - -impl private::Sealed for ObjectStorage {} - -#[async_trait] -#[typetag::serde] -impl Storage for ObjectStorage { - fn can_write(&self) -> bool { - self.backend.can_write() - } - - #[instrument(skip_all)] - fn default_settings(&self) -> Settings { - self.backend.default_settings() - } - - #[instrument(skip_all)] - async fn fetch_config( - &self, - settings: &Settings, - ) -> StorageResult { - let path = self.get_config_path(); - let response = self.get_client(settings).await.get(&path).await; - - match response { - Ok(result) => { - let version = VersionInfo { - etag: result.meta.e_tag.as_ref().cloned().map(ETag), - generation: 
result.meta.version.as_ref().cloned().map(Generation), - }; - - Ok(FetchConfigResult::Found { - bytes: result.bytes().await.map_err(Box::new)?, - version, - }) - } - Err(object_store::Error::NotFound { .. }) => Ok(FetchConfigResult::NotFound), - Err(err) => Err(Box::new(err).into()), - } - } - #[instrument(skip(self, settings, config))] - async fn update_config( - &self, - settings: &Settings, - config: Bytes, - previous_version: &VersionInfo, - ) -> StorageResult { - let path = self.get_config_path(); - let attributes = if settings.unsafe_use_metadata() { - Attributes::from_iter(vec![( - Attribute::ContentType, - AttributeValue::from("application/yaml"), - )]) - } else { - Attributes::new() - }; - - let mode = self.get_put_mode(settings, previous_version); - - let options = PutOptions { mode, attributes, ..PutOptions::default() }; - let res = - self.get_client(settings).await.put_opts(&path, config.into(), options).await; - match res { - Ok(res) => { - let new_version = VersionInfo { - etag: res.e_tag.map(ETag), - generation: res.version.map(Generation), - }; - Ok(UpdateConfigResult::Updated { new_version }) - } - Err(object_store::Error::Precondition { .. 
}) => { - Ok(UpdateConfigResult::NotOnLatestVersion) - } - Err(err) => Err(Box::new(err).into()), - } - } - - #[instrument(skip(self, settings))] - async fn fetch_snapshot( - &self, - settings: &Settings, - id: &SnapshotId, - ) -> StorageResult> { - let path = self.get_snapshot_path(id); - Ok(Box::new(self.get_object_reader(settings, &path).await?)) - } - - #[instrument(skip(self, settings))] - async fn fetch_manifest_known_size( - &self, - settings: &Settings, - id: &ManifestId, - size: u64, - ) -> StorageResult { - let path = self.get_manifest_path(id); - self.get_object_concurrently(settings, path.as_ref(), &(0..size)).await - } - - #[instrument(skip(self, settings))] - async fn fetch_manifest_unknown_size( - &self, - settings: &Settings, - id: &ManifestId, - ) -> StorageResult> { - let path = self.get_manifest_path(id); - Ok(Box::new(self.get_object_reader(settings, &path).await?)) - } - - #[instrument(skip(self, settings))] - async fn fetch_transaction_log( - &self, - settings: &Settings, - id: &SnapshotId, - ) -> StorageResult> { - let path = self.get_transaction_path(id); - Ok(Box::new(self.get_object_reader(settings, &path).await?)) - } - - #[instrument(skip(self, settings, metadata, bytes))] - async fn write_snapshot( - &self, - settings: &Settings, - id: SnapshotId, - metadata: Vec<(String, String)>, - bytes: Bytes, - ) -> StorageResult<()> { - let path = self.get_snapshot_path(&id); - let attributes = self.metadata_to_attributes(settings, metadata); - let options = PutOptions { attributes, ..PutOptions::default() }; - // FIXME: use multipart - self.get_client(settings) - .await - .put_opts(&path, bytes.into(), options) - .await - .map_err(Box::new)?; - Ok(()) - } - - #[instrument(skip(self, settings, metadata, bytes))] - async fn write_manifest( - &self, - settings: &Settings, - id: ManifestId, - metadata: Vec<(String, String)>, - bytes: Bytes, - ) -> StorageResult<()> { - let path = self.get_manifest_path(&id); - let attributes = 
self.metadata_to_attributes(settings, metadata); - let options = PutOptions { attributes, ..PutOptions::default() }; - // FIXME: use multipart - self.get_client(settings) - .await - .put_opts(&path, bytes.into(), options) - .await - .map_err(Box::new)?; - Ok(()) - } - - #[instrument(skip(self, settings, metadata, bytes))] - async fn write_transaction_log( - &self, - settings: &Settings, - id: SnapshotId, - metadata: Vec<(String, String)>, - bytes: Bytes, - ) -> StorageResult<()> { - let path = self.get_transaction_path(&id); - let attributes = self.metadata_to_attributes(settings, metadata); - let options = PutOptions { attributes, ..PutOptions::default() }; - // FIXME: use multipart - self.get_client(settings) - .await - .put_opts(&path, bytes.into(), options) - .await - .map_err(Box::new)?; - Ok(()) - } - - #[instrument(skip(self, settings))] - async fn fetch_chunk( - &self, - settings: &Settings, - id: &ChunkId, - range: &Range, - ) -> Result { - let path = self.get_chunk_path(id); - self.get_object_concurrently(settings, path.as_ref(), range) - .await? - .to_bytes((range.end - range.start + 16) as usize) - .await - } - - #[instrument(skip(self, settings, bytes))] - async fn write_chunk( - &self, - settings: &Settings, - id: ChunkId, - bytes: Bytes, - ) -> Result<(), StorageError> { - let path = self.get_chunk_path(&id); - self.get_client(settings) - .await - .put(&path, bytes.into()) - .await - .map_err(Box::new)?; - Ok(()) - } - - #[instrument(skip(self, settings))] - async fn get_ref( - &self, - settings: &Settings, - ref_key: &str, - ) -> StorageResult { - let key = self.ref_key(ref_key); - match self.get_client(settings).await.get(&key).await { - Ok(res) => { - let etag = res.meta.e_tag.clone().map(ETag); - let generation = res.meta.version.clone().map(Generation); - Ok(GetRefResult::Found { - bytes: res.bytes().await.map_err(Box::new)?, - version: VersionInfo { etag, generation }, - }) - } - Err(object_store::Error::NotFound { .. 
}) => Ok(GetRefResult::NotFound), - Err(err) => Err(Box::new(err).into()), - } - } - - #[instrument(skip(self, settings))] - async fn ref_names(&self, settings: &Settings) -> StorageResult> { - let prefix = &self.ref_key(""); - - Ok(self - .get_client(settings) - .await - .list(Some(prefix.clone()).as_ref()) - .try_filter_map(|meta| async move { - let name = self.get_ref_name(prefix, &meta); - if name.is_none() { - tracing::error!(object = ?meta, "Bad ref name") - } - Ok(name) - }) - .try_collect() - .await - .map_err(Box::new)?) - } - - #[instrument(skip(self, settings, bytes))] - async fn write_ref( - &self, - settings: &Settings, - ref_key: &str, - bytes: Bytes, - previous_version: &VersionInfo, - ) -> StorageResult { - let key = self.ref_key(ref_key); - let mode = self.get_put_mode(settings, previous_version); - let opts = PutOptions { mode, ..PutOptions::default() }; - - match self - .get_client(settings) - .await - .put_opts(&key, PutPayload::from_bytes(bytes), opts) - .await - { - Ok(_) => Ok(WriteRefResult::Written), - Err(object_store::Error::Precondition { .. }) - | Err(object_store::Error::AlreadyExists { .. 
}) => { - Ok(WriteRefResult::WontOverwrite) - } - Err(err) => Err(Box::new(err).into()), - } - } - - #[instrument(skip(self, settings))] - async fn list_objects<'a>( - &'a self, - settings: &Settings, - prefix: &str, - ) -> StorageResult>>> { - let prefix = ObjectPath::from(format!("{}/{}", self.backend.prefix(), prefix)); - let stream = self - .get_client(settings) - .await - .list(Some(&prefix)) - // TODO: we should signal error instead of filtering - .try_filter_map(|object| async move { - let info = object_to_list_info(&object); - if info.is_none() { - tracing::error!(object=?object, "Found bad object while listing"); - } - Ok(info) - }) - .map_err(Box::new) - .err_into(); - Ok(stream.boxed()) - } - - #[instrument(skip(self, batch))] - async fn delete_batch( - &self, - settings: &Settings, - prefix: &str, - batch: Vec<(String, u64)>, - ) -> StorageResult { - let mut sizes = HashMap::new(); - let mut ids = Vec::new(); - for (id, size) in batch { - let path = self.get_path_str(prefix, id.as_str()); - ids.push(Ok(path.clone())); - sizes.insert(path, size); - } - let results = - self.get_client(settings).await.delete_stream(stream::iter(ids).boxed()); - let res = results - .fold(DeleteObjectsResult::default(), |mut res, delete_result| { - if let Ok(deleted_path) = delete_result { - if let Some(size) = sizes.get(&deleted_path) { - res.deleted_objects += 1; - res.deleted_bytes += *size; - } - } else { - tracing::error!( - error = ?delete_result, - "Error deleting object", - ); - } - ready(res) - }) - .await; - Ok(res) - } - - #[instrument(skip(self, settings))] - async fn get_snapshot_last_modified( - &self, - settings: &Settings, - snapshot: &SnapshotId, - ) -> StorageResult> { - let path = self.get_snapshot_path(snapshot); - let res = self.get_client(settings).await.head(&path).await.map_err(Box::new)?; - Ok(res.last_modified) - } - - #[instrument(skip(self))] - async fn get_object_range_buf( - &self, - settings: &Settings, - key: &str, - range: &Range, - ) -> 
StorageResult> { - let path = ObjectPath::from(key); - let usize_range = range.start..range.end; - let range = Some(usize_range.into()); - let opts = GetOptions { range, ..Default::default() }; - Ok(Box::new( - self.get_client(settings) - .await - .get_opts(&path, opts) - .await - .map_err(Box::new)? - .bytes() - .await - .map_err(Box::new)?, - )) - } - - #[instrument(skip(self))] - async fn get_object_range_read( - &self, - settings: &Settings, - key: &str, - range: &Range, - ) -> StorageResult> { - let path = ObjectPath::from(key); - let usize_range = range.start..range.end; - let range = Some(usize_range.into()); - let opts = GetOptions { range, ..Default::default() }; - let res: Box = Box::new( - self.get_client(settings) - .await - .get_opts(&path, opts) - .await - .map_err(Box::new)? - .into_stream() - .err_into() - .into_async_read() - .compat(), - ); - Ok(res) - } -} - -#[typetag::serde(tag = "object_store_provider_type")] -pub trait ObjectStoreBackend: Debug + Display + Sync + Send { - fn mk_object_store( - &self, - settings: &Settings, - ) -> Result, StorageError>; - - /// The prefix for the object store. - fn prefix(&self) -> String; - - /// We need this because object_store's local file implementation doesn't sort refs. Since this - /// implementation is used only for tests, it's OK to sort in memory. 
- fn artificially_sort_refs_in_mem(&self) -> bool { - false - } - - fn default_settings(&self) -> Settings; - - fn can_write(&self) -> bool { - true - } -} - -#[derive(Debug, Serialize, Deserialize)] -pub struct InMemoryObjectStoreBackend; - -impl fmt::Display for InMemoryObjectStoreBackend { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "InMemoryObjectStoreBackend") - } -} - -#[typetag::serde(name = "in_memory_object_store_provider")] -impl ObjectStoreBackend for InMemoryObjectStoreBackend { - fn mk_object_store( - &self, - _settings: &Settings, - ) -> Result, StorageError> { - Ok(Arc::new(InMemory::new())) - } - - fn prefix(&self) -> String { - "".to_string() - } - - fn default_settings(&self) -> Settings { - Settings { - concurrency: Some(ConcurrencySettings { - // we do != 1 because we use this store for tests - max_concurrent_requests_for_object: Some( - NonZeroU16::new(5).unwrap_or(NonZeroU16::MIN), - ), - ideal_concurrent_request_size: Some( - NonZeroU64::new(1).unwrap_or(NonZeroU64::MIN), - ), - }), - retries: Some(RetriesSettings { - max_tries: Some(NonZeroU16::MIN), - initial_backoff_ms: Some(0), - max_backoff_ms: Some(0), - }), - - ..Default::default() - } - } -} - -#[derive(Debug, Serialize, Deserialize)] -pub struct LocalFileSystemObjectStoreBackend { - path: PathBuf, -} - -impl fmt::Display for LocalFileSystemObjectStoreBackend { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "LocalFileSystemObjectStoreBackend(path={})", self.path.display()) - } -} - -#[typetag::serde(name = "local_file_system_object_store_provider")] -impl ObjectStoreBackend for LocalFileSystemObjectStoreBackend { - fn mk_object_store( - &self, - _settings: &Settings, - ) -> Result, StorageError> { - create_dir_all(&self.path).map_err(|e| StorageErrorKind::Other(e.to_string()))?; - - let path = std::fs::canonicalize(&self.path) - .map_err(|e| StorageErrorKind::Other(e.to_string()))?; - Ok(Arc::new( - 
LocalFileSystem::new_with_prefix(path) - .map_err(|e| StorageErrorKind::Other(e.to_string()))?, - )) - } - - fn prefix(&self) -> String { - "".to_string() - } - - fn artificially_sort_refs_in_mem(&self) -> bool { - true - } - - fn default_settings(&self) -> Settings { - Settings { - concurrency: Some(ConcurrencySettings { - max_concurrent_requests_for_object: Some( - NonZeroU16::new(5).unwrap_or(NonZeroU16::MIN), - ), - ideal_concurrent_request_size: Some( - NonZeroU64::new(4 * 1024).unwrap_or(NonZeroU64::MIN), - ), - }), - unsafe_use_conditional_update: Some(false), - unsafe_use_metadata: Some(false), - retries: Some(RetriesSettings { - max_tries: Some(NonZeroU16::new(1).unwrap_or(NonZeroU16::MIN)), - initial_backoff_ms: Some(0), - max_backoff_ms: Some(0), - }), - ..Default::default() - } - } -} - -#[derive(Debug, Serialize, Deserialize)] -pub struct HttpObjectStoreBackend { - pub url: String, - pub config: Option>, -} - -impl fmt::Display for HttpObjectStoreBackend { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!( - f, - "HttpObjectStoreBackend(url={}, config={})", - self.url, - self.config - .as_ref() - .map(|c| c - .iter() - .map(|(k, v)| format!("{k:?}={v}")) - .collect::>() - .join(", ")) - .unwrap_or("None".to_string()) - ) - } -} - -#[typetag::serde(name = "http_object_store_provider")] -impl ObjectStoreBackend for HttpObjectStoreBackend { - fn mk_object_store( - &self, - settings: &Settings, - ) -> Result, StorageError> { - let builder = HttpBuilder::new().with_url(&self.url); - - // Add options - let builder = self - .config - .as_ref() - .unwrap_or(&HashMap::new()) - .iter() - .fold(builder, |builder, (key, value)| builder.with_config(*key, value)); - - let builder = builder.with_retry(RetryConfig { - backoff: BackoffConfig { - init_backoff: core::time::Duration::from_millis( - settings.retries().initial_backoff_ms() as u64, - ), - max_backoff: core::time::Duration::from_millis( - settings.retries().max_backoff_ms() as u64, - ), - 
base: 2., - }, - max_retries: settings.retries().max_tries().get() as usize - 1, - retry_timeout: core::time::Duration::from_secs(5 * 60), - }); - - let store = - builder.build().map_err(|e| StorageErrorKind::Other(e.to_string()))?; - - Ok(Arc::new(store)) - } - - fn prefix(&self) -> String { - "".to_string() - } - - fn default_settings(&self) -> Settings { - Default::default() - } - - fn can_write(&self) -> bool { - // TODO: Support write operations? - false - } -} - -#[derive(Debug, Serialize, Deserialize)] -pub struct S3ObjectStoreBackend { - bucket: String, - prefix: Option, - credentials: Option, - config: Option, -} - -impl fmt::Display for S3ObjectStoreBackend { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!( - f, - "S3ObjectStoreBackend(bucket={}, prefix={}, config={})", - self.bucket, - self.prefix.as_deref().unwrap_or(""), - self.config.as_ref().map(|c| c.to_string()).unwrap_or("None".to_string()) - ) - } -} - -#[typetag::serde(name = "s3_object_store_provider")] -impl ObjectStoreBackend for S3ObjectStoreBackend { - fn mk_object_store( - &self, - settings: &Settings, - ) -> Result, StorageError> { - let builder = AmazonS3Builder::new(); - - let builder = match self.credentials.as_ref() { - Some(S3Credentials::Static(credentials)) => { - let builder = builder - .with_access_key_id(credentials.access_key_id.clone()) - .with_secret_access_key(credentials.secret_access_key.clone()); - - if let Some(session_token) = credentials.session_token.as_ref() { - builder.with_token(session_token.clone()) - } else { - builder - } - } - Some(S3Credentials::Anonymous) => builder.with_skip_signature(true), - // TODO: Support refreshable credentials - _ => AmazonS3Builder::from_env(), - }; - - let builder = if let Some(config) = self.config.as_ref() { - let builder = if let Some(region) = config.region.as_ref() { - builder.with_region(region.to_string()) - } else { - builder - }; - - let builder = if let Some(endpoint) = config.endpoint_url.as_ref() { 
- builder.with_endpoint(endpoint.to_string()) - } else { - builder - }; - - builder - .with_skip_signature(config.anonymous) - .with_allow_http(config.allow_http) - } else { - builder - }; - - // Defaults - let builder = builder - .with_bucket_name(&self.bucket) - .with_conditional_put(object_store::aws::S3ConditionalPut::ETagMatch); - - let builder = builder.with_retry(RetryConfig { - backoff: BackoffConfig { - init_backoff: core::time::Duration::from_millis( - settings.retries().initial_backoff_ms() as u64, - ), - max_backoff: core::time::Duration::from_millis( - settings.retries().max_backoff_ms() as u64, - ), - base: 2., - }, - max_retries: settings.retries().max_tries().get() as usize - 1, - retry_timeout: core::time::Duration::from_secs(5 * 60), - }); - - let store = - builder.build().map_err(|e| StorageErrorKind::Other(e.to_string()))?; - Ok(Arc::new(store)) - } - - fn prefix(&self) -> String { - self.prefix.clone().unwrap_or("".to_string()) - } - - fn default_settings(&self) -> Settings { - Default::default() - } -} - -#[derive(Debug, Serialize, Deserialize)] -pub struct AzureObjectStoreBackend { - account: String, - container: String, - prefix: Option, - credentials: Option, - config: Option>, -} - -impl fmt::Display for AzureObjectStoreBackend { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!( - f, - "AzureObjectStoreBackend(account={}, container={}, prefix={})", - self.account, - self.container, - self.prefix.as_deref().unwrap_or("") - ) - } -} - -#[typetag::serde(name = "azure_object_store_provider")] -impl ObjectStoreBackend for AzureObjectStoreBackend { - fn mk_object_store( - &self, - settings: &Settings, - ) -> Result, StorageError> { - let builder = MicrosoftAzureBuilder::new(); - - let builder = match self.credentials.as_ref() { - Some(AzureCredentials::Static(AzureStaticCredentials::AccessKey(key))) => { - builder.with_access_key(key) - } - Some(AzureCredentials::Static(AzureStaticCredentials::SASToken(token))) => { - 
builder.with_config(AzureConfigKey::SasKey, token) - } - Some(AzureCredentials::Static(AzureStaticCredentials::BearerToken( - token, - ))) => builder.with_bearer_token_authorization(token), - None | Some(AzureCredentials::FromEnv) => MicrosoftAzureBuilder::from_env(), - }; - - // Either the account name should be provided or user_emulator should be set to true to use the default account - let builder = - builder.with_account(&self.account).with_container_name(&self.container); - - let builder = self - .config - .as_ref() - .unwrap_or(&HashMap::new()) - .iter() - .fold(builder, |builder, (key, value)| builder.with_config(*key, value)); - - let builder = builder.with_retry(RetryConfig { - backoff: BackoffConfig { - init_backoff: core::time::Duration::from_millis( - settings.retries().initial_backoff_ms() as u64, - ), - max_backoff: core::time::Duration::from_millis( - settings.retries().max_backoff_ms() as u64, - ), - base: 2., - }, - max_retries: settings.retries().max_tries().get() as usize - 1, - retry_timeout: core::time::Duration::from_secs(5 * 60), - }); - - let store = - builder.build().map_err(|e| StorageErrorKind::Other(e.to_string()))?; - Ok(Arc::new(store)) - } - - fn prefix(&self) -> String { - self.prefix.clone().unwrap_or("".to_string()) - } - - fn default_settings(&self) -> Settings { - Default::default() - } -} - -#[derive(Debug, Serialize, Deserialize)] -pub struct GcsObjectStoreBackend { - pub bucket: String, - pub prefix: Option, - pub credentials: Option, - pub config: Option>, -} - -impl fmt::Display for GcsObjectStoreBackend { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!( - f, - "GcsObjectStoreBackend(bucket={}, prefix={})", - self.bucket, - self.prefix.as_deref().unwrap_or("") - ) - } -} - -#[typetag::serde(name = "gcs_object_store_provider")] -impl ObjectStoreBackend for GcsObjectStoreBackend { - fn mk_object_store( - &self, - settings: &Settings, - ) -> Result, StorageError> { - let builder = 
GoogleCloudStorageBuilder::new(); - - let builder = match self.credentials.as_ref() { - Some(GcsCredentials::Static(GcsStaticCredentials::ServiceAccount(path))) => { - let path = path.clone().into_os_string().into_string().map_err(|_| { - StorageErrorKind::Other("invalid service account path".to_string()) - })?; - builder.with_service_account_path(path) - } - Some(GcsCredentials::Static(GcsStaticCredentials::ServiceAccountKey( - key, - ))) => builder.with_service_account_key(key), - Some(GcsCredentials::Static( - GcsStaticCredentials::ApplicationCredentials(path), - )) => { - let path = path.clone().into_os_string().into_string().map_err(|_| { - StorageErrorKind::Other( - "invalid application credentials path".to_string(), - ) - })?; - builder.with_application_credentials(path) - } - Some(GcsCredentials::Static(GcsStaticCredentials::BearerToken(token))) => { - let provider = StaticCredentialProvider::new(GcpCredential::from(token)); - builder.with_credentials(Arc::new(provider)) - } - Some(GcsCredentials::Refreshable(fetcher)) => { - let credential_provider = - GcsRefreshableCredentialProvider::new(Arc::clone(fetcher)); - builder.with_credentials(Arc::new(credential_provider)) - } - Some(GcsCredentials::Anonymous) => builder.with_skip_signature(true), - None | Some(GcsCredentials::FromEnv) => GoogleCloudStorageBuilder::from_env(), - }; - - let builder = builder.with_bucket_name(&self.bucket); - - // Add options - let builder = self - .config - .as_ref() - .unwrap_or(&HashMap::new()) - .iter() - .fold(builder, |builder, (key, value)| builder.with_config(*key, value)); - - let builder = builder.with_retry(RetryConfig { - backoff: BackoffConfig { - init_backoff: core::time::Duration::from_millis( - settings.retries().initial_backoff_ms() as u64, - ), - max_backoff: core::time::Duration::from_millis( - settings.retries().max_backoff_ms() as u64, - ), - base: 2., - }, - max_retries: settings.retries().max_tries().get() as usize - 1, - retry_timeout: 
core::time::Duration::from_secs(5 * 60), - }); - let store = - builder.build().map_err(|e| StorageErrorKind::Other(e.to_string()))?; - Ok(Arc::new(store)) - } - - fn prefix(&self) -> String { - self.prefix.clone().unwrap_or("".to_string()) - } - - fn default_settings(&self) -> Settings { - Default::default() - } -} - -#[derive(Debug)] -pub struct GcsRefreshableCredentialProvider { - last_credential: Arc>>, - refresher: Arc, -} - -impl GcsRefreshableCredentialProvider { - pub fn new(refresher: Arc) -> Self { - Self { last_credential: Arc::new(RwLock::new(None)), refresher } - } - - pub async fn get_or_update_credentials( - &self, - ) -> Result { - let last_credential = self.last_credential.read().await; - - // If we have a credential and it hasn't expired, return it - if let Some(creds) = last_credential.as_ref() { - if let Some(expires_after) = creds.expires_after { - if expires_after - > Utc::now() + TimeDelta::seconds(rand::random_range(120..=180)) - { - return Ok(creds.clone()); - } - } - } - - drop(last_credential); - let mut last_credential = self.last_credential.write().await; - - // Otherwise, refresh the credential and cache it - let creds = self - .refresher - .get() - .await - .map_err(|e| StorageErrorKind::Other(e.to_string()))?; - *last_credential = Some(creds.clone()); - Ok(creds) - } -} - -#[async_trait] -impl CredentialProvider for GcsRefreshableCredentialProvider { - type Credential = GcpCredential; - - async fn get_credential(&self) -> object_store::Result> { - let creds = self.get_or_update_credentials().await.map_err(|e| { - object_store::Error::Generic { store: "gcp", source: Box::new(e) } - })?; - Ok(Arc::new(GcpCredential::from(&creds))) - } -} - -fn object_to_list_info(object: &ObjectMeta) -> Option> { - let created_at = object.last_modified; - let id = object.location.filename()?.to_string(); - let size_bytes = object.size; - Some(ListInfo { id, created_at, size_bytes }) -} - -#[cfg(test)] -#[allow(clippy::expect_used, clippy::unwrap_used)] 
-mod tests { - use std::path::PathBuf; - - use icechunk_macros::tokio_test; - use tempfile::TempDir; - - use crate::format::{ChunkId, ManifestId, SnapshotId}; - - use super::ObjectStorage; - - #[tokio_test] - async fn test_serialize_object_store() { - let tmp_dir = TempDir::new().unwrap(); - let store = ObjectStorage::new_local_filesystem(tmp_dir.path()).await.unwrap(); - - let serialized = serde_json::to_string(&store).unwrap(); - - let deserialized: ObjectStorage = serde_json::from_str(&serialized).unwrap(); - assert_eq!( - store.backend.as_ref().prefix(), - deserialized.backend.as_ref().prefix() - ); - } - - struct TestLocalPath(String); - - impl From<&TestLocalPath> for std::path::PathBuf { - fn from(path: &TestLocalPath) -> Self { - std::path::PathBuf::from(&path.0) - } - } - - impl Drop for TestLocalPath { - fn drop(&mut self) { - let _ = std::fs::remove_dir_all(&self.0); - } - } - - #[tokio_test] - async fn test_canonicalize_path() { - // Absolute path - let tmp_dir = TempDir::new().unwrap(); - let store = ObjectStorage::new_local_filesystem(tmp_dir.path()).await; - assert!(store.is_ok()); - - // Relative path - let rel_path = "relative/path"; - let store = - ObjectStorage::new_local_filesystem(PathBuf::from(&rel_path).as_path()).await; - assert!(store.is_ok()); - - // Relative with leading ./ - let rel_path = TestLocalPath("./other/path".to_string()); - let store = - ObjectStorage::new_local_filesystem(PathBuf::from(&rel_path).as_path()).await; - assert!(store.is_ok()); - } - - #[tokio_test] - async fn test_object_store_paths() { - let store = ObjectStorage::new_local_filesystem(PathBuf::from(".").as_path()) - .await - .unwrap(); - - let ref_key = "ref_key"; - let ref_path = store.ref_key(ref_key); - assert_eq!(ref_path.to_string(), format!("refs/{ref_key}")); - - let snapshot_id = SnapshotId::random(); - let snapshot_path = store.get_snapshot_path(&snapshot_id); - assert_eq!(snapshot_path.to_string(), format!("snapshots/{snapshot_id}")); - - let 
manifest_id = ManifestId::random(); - let manifest_path = store.get_manifest_path(&manifest_id); - assert_eq!(manifest_path.to_string(), format!("manifests/{manifest_id}")); - - let chunk_id = ChunkId::random(); - let chunk_path = store.get_chunk_path(&chunk_id); - assert_eq!(chunk_path.to_string(), format!("chunks/{chunk_id}")); - - let transaction_id = SnapshotId::random(); - let transaction_path = store.get_transaction_path(&transaction_id); - assert_eq!( - transaction_path.to_string(), - format!("transactions/{transaction_id}") - ); - } -} diff --git a/icechunk/src/storage/s3.rs b/icechunk/src/storage/s3.rs deleted file mode 100644 index 86fe0c20d..000000000 --- a/icechunk/src/storage/s3.rs +++ /dev/null @@ -1,1144 +0,0 @@ -use std::{ - collections::HashMap, - fmt, - future::ready, - ops::Range, - path::{Path, PathBuf}, - sync::Arc, -}; - -use crate::{ - Storage, StorageError, - config::{S3Credentials, S3CredentialsFetcher, S3Options}, - format::{ChunkId, ChunkOffset, FileTypeTag, ManifestId, ObjectId, SnapshotId}, - private, -}; -use async_trait::async_trait; -use aws_config::{ - AppName, BehaviorVersion, - meta::region::RegionProviderChain, - retry::{ProvideErrorKind, RetryConfig}, -}; -use aws_credential_types::provider::error::CredentialsError; -use aws_sdk_s3::{ - Client, - config::{ - Builder, ConfigBag, IdentityCache, Intercept, ProvideCredentials, Region, - RuntimeComponents, interceptors::BeforeTransmitInterceptorContextMut, - }, - error::{BoxError, SdkError}, - operation::put_object::PutObjectError, - primitives::ByteStream, - types::{CompletedMultipartUpload, CompletedPart, Delete, Object, ObjectIdentifier}, -}; -use aws_smithy_types_convert::{date_time::DateTimeExt, stream::PaginationStreamExt}; -use bytes::{Buf, Bytes}; -use chrono::{DateTime, Utc}; -use futures::{ - StreamExt, TryStreamExt, - stream::{self, BoxStream, FuturesOrdered}, -}; -use serde::{Deserialize, Serialize}; -use tokio::{io::AsyncRead, sync::OnceCell}; -use tracing::{error, 
instrument}; - -use super::{ - CHUNK_PREFIX, CONFIG_PATH, DeleteObjectsResult, FetchConfigResult, GetRefResult, - ListInfo, MANIFEST_PREFIX, REF_PREFIX, Reader, SNAPSHOT_PREFIX, Settings, - StorageErrorKind, StorageResult, TRANSACTION_PREFIX, UpdateConfigResult, VersionInfo, - WriteRefResult, split_in_multiple_equal_requests, -}; - -#[derive(Debug, Serialize, Deserialize)] -pub struct S3Storage { - // config and credentials are stored so we are able to serialize and deserialize the struct - config: S3Options, - credentials: S3Credentials, - bucket: String, - prefix: String, - can_write: bool, - extra_read_headers: Vec<(String, String)>, - extra_write_headers: Vec<(String, String)>, - #[serde(skip)] - /// We need to use OnceCell to allow async initialization, because serde - /// does not support async cfunction calls from deserialization. This gives - /// us a way to lazily initialize the client. - client: OnceCell>, -} - -impl fmt::Display for S3Storage { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!( - f, - "S3Storage(bucket={}, prefix={}, config={})", - self.bucket, self.prefix, self.config, - ) - } -} -#[derive(Debug)] -struct ExtraHeadersInterceptor { - extra_read_headers: Vec<(String, String)>, - extra_write_headers: Vec<(String, String)>, -} - -impl Intercept for ExtraHeadersInterceptor { - fn name(&self) -> &'static str { - "ExtraHeaders" - } - - fn modify_before_retry_loop( - &self, - context: &mut BeforeTransmitInterceptorContextMut<'_>, - _runtime_components: &RuntimeComponents, - _cfg: &mut ConfigBag, - ) -> Result<(), BoxError> { - let request = context.request_mut(); - match request.method() { - "GET" | "HEAD" | "OPTIONS" | "TRACE" => { - for (k, v) in self.extra_read_headers.iter() { - request.headers_mut().insert(k.clone(), v.clone()); - } - } - _ => { - for (k, v) in self.extra_write_headers.iter() { - request.headers_mut().insert(k.clone(), v.clone()); - } - } - } - Ok(()) - } -} - -#[instrument(skip(credentials))] -pub 
async fn mk_client( - config: &S3Options, - credentials: S3Credentials, - extra_read_headers: Vec<(String, String)>, - extra_write_headers: Vec<(String, String)>, - settings: &Settings, -) -> Client { - let region = config - .region - .as_ref() - .map(|r| RegionProviderChain::first_try(Some(Region::new(r.clone())))) - .unwrap_or_else(RegionProviderChain::default_provider); - - let endpoint = config.endpoint_url.clone(); - let region = if endpoint.is_some() { - // GH793, the S3 SDK requires a region even though it may not make sense - // for S3-compatible object stores like Tigris or Ceph. - // So we set a fake region, using the `endpoint_url` as a sign that - // we are not talking to real S3 - region.or_else(Region::new("region-was-not-set")) - } else { - region - }; - - #[allow(clippy::unwrap_used)] - let app_name = AppName::new("icechunk").unwrap(); - let mut aws_config = aws_config::defaults(BehaviorVersion::v2025_01_17()) - .region(region) - .app_name(app_name); - - if let Some(endpoint) = endpoint { - aws_config = aws_config.endpoint_url(endpoint) - } - - match credentials { - S3Credentials::FromEnv => {} - S3Credentials::Anonymous => aws_config = aws_config.no_credentials(), - S3Credentials::Static(credentials) => { - aws_config = - aws_config.credentials_provider(aws_credential_types::Credentials::new( - credentials.access_key_id, - credentials.secret_access_key, - credentials.session_token, - credentials.expires_after.map(|e| e.into()), - "user", - )); - } - S3Credentials::Refreshable(fetcher) => { - aws_config = - aws_config.credentials_provider(ProvideRefreshableCredentials(fetcher)); - } - } - - let retry_config = RetryConfig::standard() - .with_max_attempts(settings.retries().max_tries().get() as u32) - .with_initial_backoff(core::time::Duration::from_millis( - settings.retries().initial_backoff_ms() as u64, - )) - .with_max_backoff(core::time::Duration::from_millis( - settings.retries().max_backoff_ms() as u64, - )); - - let mut s3_builder = 
Builder::from(&aws_config.load().await) - .force_path_style(config.force_path_style) - .retry_config(retry_config); - - // credentials may take a while to refresh, defaults are too strict - let id_cache = IdentityCache::lazy() - .load_timeout(core::time::Duration::from_secs(120)) - .buffer_time(core::time::Duration::from_secs(120)) - .build(); - - s3_builder = s3_builder.identity_cache(id_cache); - - if !extra_read_headers.is_empty() || !extra_write_headers.is_empty() { - s3_builder = s3_builder.interceptor(ExtraHeadersInterceptor { - extra_read_headers, - extra_write_headers, - }) - } - - let config = s3_builder.build(); - - Client::from_conf(config) -} - -impl S3Storage { - pub fn new( - config: S3Options, - bucket: String, - prefix: Option, - credentials: S3Credentials, - can_write: bool, - extra_read_headers: Vec<(String, String)>, - extra_write_headers: Vec<(String, String)>, - ) -> Result { - let client = OnceCell::new(); - Ok(S3Storage { - client, - config, - bucket, - prefix: prefix.unwrap_or_default(), - credentials, - can_write, - extra_read_headers, - extra_write_headers, - }) - } - - /// Get the client, initializing it if it hasn't been initialized yet. This is necessary because the - /// client is not serializeable and must be initialized after deserialization. Under normal construction - /// the original client is returned immediately. 
- #[instrument(skip_all)] - async fn get_client(&self, settings: &Settings) -> &Arc { - self.client - .get_or_init(|| async { - Arc::new( - mk_client( - &self.config, - self.credentials.clone(), - self.extra_read_headers.clone(), - self.extra_write_headers.clone(), - settings, - ) - .await, - ) - }) - .await - } - - fn get_path_str(&self, file_prefix: &str, id: &str) -> StorageResult { - let path = PathBuf::from_iter([self.prefix.as_str(), file_prefix, id]); - let path_str = - path.into_os_string().into_string().map_err(StorageErrorKind::BadPrefix)?; - - Ok(path_str.replace("\\", "/")) - } - - fn get_path( - &self, - file_prefix: &str, - id: &ObjectId, - ) -> StorageResult { - // we serialize the url using crockford - self.get_path_str(file_prefix, id.to_string().as_str()) - } - - fn get_config_path(&self) -> StorageResult { - self.get_path_str("", CONFIG_PATH) - } - - fn get_snapshot_path(&self, id: &SnapshotId) -> StorageResult { - self.get_path(SNAPSHOT_PREFIX, id) - } - - fn get_manifest_path(&self, id: &ManifestId) -> StorageResult { - self.get_path(MANIFEST_PREFIX, id) - } - - fn get_chunk_path(&self, id: &ChunkId) -> StorageResult { - self.get_path(CHUNK_PREFIX, id) - } - - fn get_transaction_path(&self, id: &SnapshotId) -> StorageResult { - self.get_path(TRANSACTION_PREFIX, id) - } - - fn ref_key(&self, ref_key: &str) -> StorageResult { - let path = PathBuf::from_iter([self.prefix.as_str(), REF_PREFIX, ref_key]); - let path_str = - path.into_os_string().into_string().map_err(StorageErrorKind::BadPrefix)?; - - Ok(path_str.replace("\\", "/")) - } - - async fn get_object_reader( - &self, - settings: &Settings, - key: &str, - ) -> StorageResult> { - let client = self.get_client(settings).await; - let b = client.get_object().bucket(self.bucket.as_str()).key(key); - Ok(Box::new(b.send().await.map_err(Box::new)?.body.into_async_read())) - } - - async fn put_object_single< - I: IntoIterator, impl Into)>, - >( - &self, - settings: &Settings, - key: &str, - 
content_type: Option>, - metadata: I, - storage_class: Option<&String>, - bytes: impl Into, - ) -> StorageResult<()> { - let mut b = self - .get_client(settings) - .await - .put_object() - .bucket(self.bucket.clone()) - .key(key); - - if settings.unsafe_use_metadata() { - if let Some(ct) = content_type { - b = b.content_type(ct) - }; - } - - if settings.unsafe_use_metadata() { - for (k, v) in metadata { - b = b.metadata(k, v); - } - } - - if let Some(klass) = storage_class { - let klass = klass.as_str().into(); - b = b.storage_class(klass); - } - - b.body(bytes.into()).send().await.map_err(Box::new)?; - Ok(()) - } - - async fn put_object_multipart< - I: IntoIterator, impl Into)>, - >( - &self, - settings: &Settings, - key: &str, - content_type: Option>, - metadata: I, - storage_class: Option<&String>, - bytes: &Bytes, - ) -> StorageResult<()> { - let mut multi = self - .get_client(settings) - .await - .create_multipart_upload() - // We would like this, but it fails in MinIO - //.checksum_type(aws_sdk_s3::types::ChecksumType::FullObject) - //.checksum_algorithm(aws_sdk_s3::types::ChecksumAlgorithm::Crc64Nvme) - .bucket(self.bucket.clone()) - .key(key); - - if settings.unsafe_use_metadata() { - if let Some(ct) = content_type { - multi = multi.content_type(ct) - }; - for (k, v) in metadata { - multi = multi.metadata(k, v); - } - } - - if let Some(klass) = storage_class { - let klass = klass.as_str().into(); - multi = multi.storage_class(klass); - } - - let create_res = multi.send().await.map_err(Box::new)?; - let upload_id = - create_res.upload_id().ok_or(StorageError::from(StorageErrorKind::Other( - "No upload_id in create multipart upload result".to_string(), - )))?; - - // We need to ensure all requests are the same size except for the last one, which can be - // smaller. 
This is a requirement for R2 compatibility - let parts = split_in_multiple_equal_requests( - &(0..bytes.len() as u64), - settings.concurrency().ideal_concurrent_request_size().get(), - settings.concurrency().max_concurrent_requests_for_object().get(), - ) - .collect::>(); - - let results = parts - .into_iter() - .enumerate() - .map(|(part_idx, range)| async move { - let body = bytes.slice(range.start as usize..range.end as usize).into(); - let idx = part_idx as i32 + 1; - self.get_client(settings) - .await - .upload_part() - .upload_id(upload_id) - .bucket(self.bucket.clone()) - .key(key) - .part_number(idx) - .body(body) - .send() - .await - .map(|res| (idx, res)) - }) - .collect::>(); - - let completed_parts = results - .map_ok(|(idx, res)| { - let etag = res.e_tag().unwrap_or(""); - CompletedPart::builder() - .e_tag(strip_quotes(etag)) - .part_number(idx) - .build() - }) - .try_collect::>() - .await - .map_err(Box::new)?; - - let completed_parts = - CompletedMultipartUpload::builder().set_parts(Some(completed_parts)).build(); - - self.get_client(settings) - .await - .complete_multipart_upload() - .bucket(self.bucket.clone()) - .key(key) - .upload_id(upload_id) - //.checksum_type(aws_sdk_s3::types::ChecksumType::FullObject) - .multipart_upload(completed_parts) - .send() - .await - .map_err(Box::new)?; - - Ok(()) - } - - async fn put_object< - I: IntoIterator, impl Into)>, - >( - &self, - settings: &Settings, - key: &str, - content_type: Option>, - metadata: I, - storage_class: Option<&String>, - bytes: &Bytes, - ) -> StorageResult<()> { - if bytes.len() >= settings.minimum_size_for_multipart_upload() as usize { - self.put_object_multipart( - settings, - key, - content_type, - metadata, - storage_class, - bytes, - ) - .await - } else { - self.put_object_single( - settings, - key, - content_type, - metadata, - storage_class, - bytes.clone(), - ) - .await - } - } - - fn get_ref_name<'a>(&self, key: Option<&'a str>) -> Option<&'a str> { - let key = key?; - let prefix 
= self.ref_key("").ok()?; - let relative_key = key.strip_prefix(&prefix)?; - let ref_name = relative_key.split('/').next()?; - Some(ref_name) - } -} - -pub fn range_to_header(range: &Range) -> String { - format!("bytes={}-{}", range.start, range.end - 1) -} - -impl private::Sealed for S3Storage {} - -#[async_trait] -#[typetag::serde] -impl Storage for S3Storage { - fn can_write(&self) -> bool { - self.can_write - } - - #[instrument(skip_all)] - async fn fetch_config( - &self, - settings: &Settings, - ) -> StorageResult { - let key = self.get_config_path()?; - let res = self - .get_client(settings) - .await - .get_object() - .bucket(self.bucket.clone()) - .key(key) - .send() - .await; - - match res { - Ok(output) => match output.e_tag { - Some(etag) => Ok(FetchConfigResult::Found { - bytes: output.body.collect().await.map_err(Box::new)?.into_bytes(), - version: VersionInfo::from_etag_only(etag), - }), - None => Ok(FetchConfigResult::NotFound), - }, - Err(sdk_err) => match sdk_err.as_service_error() { - Some(e) if e.is_no_such_key() => Ok(FetchConfigResult::NotFound), - Some(_) - if sdk_err - .raw_response() - .is_some_and(|x| x.status().as_u16() == 404) => - { - // needed for Cloudflare R2 public bucket URLs - // if config doesn't exist we get a 404 that isn't parsed by the AWS SDK - // into anything useful. So we need to parse the raw response, and match - // the status code. 
- Ok(FetchConfigResult::NotFound) - } - _ => Err(Box::new(sdk_err).into()), - }, - } - } - - #[instrument(skip(self, settings, config))] - async fn update_config( - &self, - settings: &Settings, - config: Bytes, - previous_version: &VersionInfo, - ) -> StorageResult { - let key = self.get_config_path()?; - let mut req = self - .get_client(settings) - .await - .put_object() - .bucket(self.bucket.clone()) - .key(key) - .body(config.into()); - - if settings.unsafe_use_metadata() { - req = req.content_type("application/yaml") - } - - if let Some(klass) = settings.metadata_storage_class() { - req = req.storage_class(klass.as_str().into()) - } - - match ( - previous_version.etag(), - settings.unsafe_use_conditional_create(), - settings.unsafe_use_conditional_update(), - ) { - (None, true, _) => req = req.if_none_match("*"), - (Some(etag), _, true) => req = req.if_match(strip_quotes(etag)), - (_, _, _) => {} - } - - let res = req.send().await; - - match res { - Ok(out) => { - let new_etag = out - .e_tag() - .ok_or(StorageErrorKind::Other( - "Config object should have an etag".to_string(), - ))? 
- .to_string(); - let new_version = VersionInfo::from_etag_only(new_etag); - Ok(UpdateConfigResult::Updated { new_version }) - } - // minio returns this - Err(SdkError::ServiceError(err)) => { - if err.err().meta().code() == Some("PreconditionFailed") { - Ok(UpdateConfigResult::NotOnLatestVersion) - } else { - Err(StorageError::from(Box::new( - SdkError::::ServiceError(err), - ))) - } - } - // S3 API documents this - Err(SdkError::ResponseError(err)) => { - let status = err.raw().status().as_u16(); - // see https://docs.aws.amazon.com/AmazonS3/latest/API/API_PutObject.html#API_PutObject_RequestSyntax - if status == 409 || status == 412 { - Ok(UpdateConfigResult::NotOnLatestVersion) - } else { - Err(StorageError::from(Box::new( - SdkError::::ResponseError(err), - ))) - } - } - Err(err) => Err(Box::new(err).into()), - } - } - - #[instrument(skip(self, settings))] - async fn fetch_snapshot( - &self, - settings: &Settings, - id: &SnapshotId, - ) -> StorageResult> { - let key = self.get_snapshot_path(id)?; - self.get_object_reader(settings, key.as_str()).await - } - - #[instrument(skip(self, settings))] - async fn fetch_manifest_known_size( - &self, - settings: &Settings, - id: &ManifestId, - size: u64, - ) -> StorageResult { - let key = self.get_manifest_path(id)?; - self.get_object_concurrently(settings, key.as_str(), &(0..size)).await - } - - #[instrument(skip(self, settings))] - async fn fetch_manifest_unknown_size( - &self, - settings: &Settings, - id: &ManifestId, - ) -> StorageResult> { - let key = self.get_manifest_path(id)?; - self.get_object_reader(settings, key.as_str()).await - } - - #[instrument(skip(self, settings))] - async fn fetch_transaction_log( - &self, - settings: &Settings, - id: &SnapshotId, - ) -> StorageResult> { - let key = self.get_transaction_path(id)?; - self.get_object_reader(settings, key.as_str()).await - } - - #[instrument(skip(self, settings))] - async fn fetch_chunk( - &self, - settings: &Settings, - id: &ChunkId, - range: &Range, - ) 
-> StorageResult { - let key = self.get_chunk_path(id)?; - self.get_object_concurrently(settings, key.as_str(), range) - .await? - .to_bytes((range.end - range.start) as usize) - .await - } - - #[instrument(skip(self, settings, metadata, bytes))] - async fn write_snapshot( - &self, - settings: &Settings, - id: SnapshotId, - metadata: Vec<(String, String)>, - bytes: Bytes, - ) -> StorageResult<()> { - let key = self.get_snapshot_path(&id)?; - self.put_object( - settings, - key.as_str(), - None::, - metadata, - settings.metadata_storage_class(), - &bytes, - ) - .await - } - - #[instrument(skip(self, settings, metadata, bytes))] - async fn write_manifest( - &self, - settings: &Settings, - id: ManifestId, - metadata: Vec<(String, String)>, - bytes: Bytes, - ) -> StorageResult<()> { - let key = self.get_manifest_path(&id)?; - self.put_object( - settings, - key.as_str(), - None::, - metadata.into_iter(), - settings.metadata_storage_class(), - &bytes, - ) - .await - } - - #[instrument(skip(self, settings, metadata, bytes))] - async fn write_transaction_log( - &self, - settings: &Settings, - id: SnapshotId, - metadata: Vec<(String, String)>, - bytes: Bytes, - ) -> StorageResult<()> { - let key = self.get_transaction_path(&id)?; - self.put_object( - settings, - key.as_str(), - None::, - metadata.into_iter(), - settings.metadata_storage_class(), - &bytes, - ) - .await - } - - #[instrument(skip(self, settings, bytes))] - async fn write_chunk( - &self, - settings: &Settings, - id: ChunkId, - bytes: bytes::Bytes, - ) -> Result<(), StorageError> { - let key = self.get_chunk_path(&id)?; - let metadata: [(String, String); 0] = []; - self.put_object( - settings, - key.as_str(), - None::, - metadata, - settings.chunks_storage_class(), - &bytes, - ) - .await - } - - #[instrument(skip(self, settings))] - async fn get_ref( - &self, - settings: &Settings, - ref_key: &str, - ) -> StorageResult { - let key = self.ref_key(ref_key)?; - let res = self - .get_client(settings) - .await - 
.get_object() - .bucket(self.bucket.clone()) - .key(key.clone()) - .send() - .await; - - match res { - Ok(res) => { - let bytes = res.body.collect().await.map_err(Box::new)?.into_bytes(); - if let Some(version) = res.e_tag.map(VersionInfo::from_etag_only) { - Ok(GetRefResult::Found { bytes, version }) - } else { - Ok(GetRefResult::NotFound) - } - } - Err(err) - if err - .as_service_error() - .map(|e| e.is_no_such_key()) - .unwrap_or(false) => - { - Ok(GetRefResult::NotFound) - } - Err(err) => Err(Box::new(err).into()), - } - } - - #[instrument(skip_all)] - async fn ref_names(&self, settings: &Settings) -> StorageResult> { - let prefix = self.ref_key("")?; - let mut paginator = self - .get_client(settings) - .await - .list_objects_v2() - .bucket(self.bucket.clone()) - .prefix(prefix.clone()) - .into_paginator() - .send(); - - let mut res = Vec::new(); - - while let Some(page) = paginator.try_next().await.map_err(Box::new)? { - for obj in page.contents.unwrap_or_else(Vec::new) { - let name = self.get_ref_name(obj.key()); - if let Some(name) = name { - res.push(name.to_string()); - } else { - tracing::error!(object = ?obj, "Bad ref name") - } - } - } - - Ok(res) - } - - #[instrument(skip(self, settings, bytes))] - async fn write_ref( - &self, - settings: &Settings, - ref_key: &str, - bytes: Bytes, - previous_version: &VersionInfo, - ) -> StorageResult { - let key = self.ref_key(ref_key)?; - let mut builder = self - .get_client(settings) - .await - .put_object() - .bucket(self.bucket.clone()) - .key(key.clone()); - - match ( - previous_version.etag(), - settings.unsafe_use_conditional_create(), - settings.unsafe_use_conditional_update(), - ) { - (None, true, _) => { - builder = builder.if_none_match("*"); - } - (Some(etag), _, true) => { - builder = builder.if_match(strip_quotes(etag)); - } - (_, _, _) => {} - } - - if let Some(klass) = settings.metadata_storage_class() { - builder = builder.storage_class(klass.as_str().into()) - } - - let res = 
builder.body(bytes.into()).send().await; - - match res { - Ok(_) => Ok(WriteRefResult::Written), - Err(err) => { - let code = err.as_service_error().and_then(|e| e.code()).unwrap_or(""); - if code.contains("PreconditionFailed") - || code.contains("ConditionalRequestConflict") - { - Ok(WriteRefResult::WontOverwrite) - } else { - Err(Box::new(err).into()) - } - } - } - } - - #[instrument(skip(self, settings))] - async fn list_objects<'a>( - &'a self, - settings: &Settings, - prefix: &str, - ) -> StorageResult>>> { - let prefix = format!("{}/{}", self.prefix, prefix).replace("//", "/"); - let stream = self - .get_client(settings) - .await - .list_objects_v2() - .bucket(self.bucket.clone()) - .prefix(prefix) - .into_paginator() - .send() - .into_stream_03x() - .map_err(Box::new) - .try_filter_map(|page| { - let contents = page.contents.map(|cont| stream::iter(cont).map(Ok)); - ready(Ok(contents)) - }) - .try_flatten() - .try_filter_map(|object| async move { - let info = object_to_list_info(&object); - if info.is_none() { - tracing::error!(object=?object, "Found bad object while listing"); - } - Ok(info) - }); - Ok(stream.boxed()) - } - - #[instrument(skip(self, batch))] - async fn delete_batch( - &self, - settings: &Settings, - prefix: &str, - batch: Vec<(String, u64)>, - ) -> StorageResult { - let mut sizes = HashMap::new(); - let mut ids = Vec::new(); - for (id, size) in batch.into_iter() { - if let Ok(key) = self.get_path_str(prefix, id.as_str()) { - if let Ok(ident) = ObjectIdentifier::builder().key(key.clone()).build() { - ids.push(ident); - sizes.insert(key, size); - } - } - } - - let delete = Delete::builder() - .set_objects(Some(ids)) - .build() - .map_err(|e| StorageErrorKind::Other(e.to_string()))?; - - let res = self - .get_client(settings) - .await - .delete_objects() - .bucket(self.bucket.clone()) - .delete(delete) - .send() - .await - .map_err(Box::new)?; - - if let Some(err) = res.errors.as_ref().and_then(|e| e.first()) { - tracing::error!( - error = 
?err, - "Errors deleting objects", - ); - } - - let mut result = DeleteObjectsResult::default(); - for deleted in res.deleted() { - if let Some(key) = deleted.key() { - let size = sizes.get(key).unwrap_or(&0); - result.deleted_bytes += *size; - result.deleted_objects += 1; - } else { - tracing::error!("Deleted object without key"); - } - } - Ok(result) - } - - #[instrument(skip(self, settings))] - async fn get_snapshot_last_modified( - &self, - settings: &Settings, - snapshot: &SnapshotId, - ) -> StorageResult> { - let key = self.get_snapshot_path(snapshot)?; - let res = self - .get_client(settings) - .await - .head_object() - .bucket(self.bucket.clone()) - .key(key) - .send() - .await - .map_err(Box::new)?; - - let res = res.last_modified.ok_or(StorageErrorKind::Other( - "Object has no last_modified field".to_string(), - ))?; - let res = res.to_chrono_utc().map_err(|_| { - StorageErrorKind::Other("Invalid metadata timestamp".to_string()) - })?; - - Ok(res) - } - - #[instrument(skip(self))] - async fn get_object_range_buf( - &self, - settings: &Settings, - key: &str, - range: &Range, - ) -> StorageResult> { - let b = self - .get_client(settings) - .await - .get_object() - .bucket(self.bucket.clone()) - .key(key) - .range(range_to_header(range)); - Ok(Box::new( - b.send().await.map_err(Box::new)?.body.collect().await.map_err(Box::new)?, - )) - } - - #[instrument(skip(self))] - async fn get_object_range_read( - &self, - settings: &Settings, - key: &str, - range: &Range, - ) -> StorageResult> { - let client = self.get_client(settings).await; - let bucket = self.bucket.clone(); - Ok(Box::new(get_object_range(client.as_ref(), bucket, key, range).await?)) - } -} - -fn object_to_list_info(object: &Object) -> Option> { - let key = object.key()?; - let last_modified = object.last_modified()?; - let created_at = last_modified.to_chrono_utc().ok()?; - let id = Path::new(key).file_name().and_then(|s| s.to_str())?.to_string(); - let size_bytes = object.size.unwrap_or(0) as u64; 
- Some(ListInfo { id, created_at, size_bytes }) -} - -#[derive(Debug)] -struct ProvideRefreshableCredentials(Arc); - -impl ProvideCredentials for ProvideRefreshableCredentials { - fn provide_credentials<'a>( - &'a self, - ) -> aws_credential_types::provider::future::ProvideCredentials<'a> - where - Self: 'a, - { - aws_credential_types::provider::future::ProvideCredentials::new(self.provide()) - } -} - -impl ProvideRefreshableCredentials { - async fn provide( - &self, - ) -> Result { - let creds = self - .0 - .get() - .await - .inspect_err(|err| error!(error = err, "Cannot load credentials")) - .map_err(CredentialsError::not_loaded)?; - let creds = aws_credential_types::Credentials::new( - creds.access_key_id, - creds.secret_access_key, - creds.session_token, - creds.expires_after.map(|e| e.into()), - "user", - ); - Ok(creds) - } -} - -async fn get_object_range( - client: &Client, - bucket: String, - key: &str, - range: &Range, -) -> StorageResult> { - let b = client.get_object().bucket(bucket).key(key).range(range_to_header(range)); - Ok(b.send().await.map_err(Box::new)?.body.into_async_read()) -} - -fn strip_quotes(s: &str) -> &str { - s.strip_prefix('"').and_then(|s| s.strip_suffix('"')).unwrap_or(s) -} - -#[cfg(test)] -#[allow(clippy::unwrap_used)] -mod tests { - use icechunk_macros::tokio_test; - - use crate::config::{S3Credentials, S3Options, S3StaticCredentials}; - - use super::*; - - #[tokio_test] - async fn test_serialize_s3_storage() { - let config = S3Options { - region: Some("us-west-2".to_string()), - endpoint_url: Some("http://localhost:9000".to_string()), - allow_http: true, - anonymous: false, - force_path_style: false, - }; - let credentials = S3Credentials::Static(S3StaticCredentials { - access_key_id: "access_key_id".to_string(), - secret_access_key: "secret_access_key".to_string(), - session_token: Some("session_token".to_string()), - expires_after: None, - }); - let storage = S3Storage::new( - config, - "bucket".to_string(), - 
Some("prefix".to_string()), - credentials, - true, - Vec::new(), - Vec::new(), - ) - .unwrap(); - - let serialized = serde_json::to_string(&storage).unwrap(); - - assert_eq!( - serialized, - r#"{"config":{"region":"us-west-2","endpoint_url":"http://localhost:9000","anonymous":false,"allow_http":true,"force_path_style":false},"credentials":{"s3_credential_type":"static","access_key_id":"access_key_id","secret_access_key":"secret_access_key","session_token":"session_token","expires_after":null},"bucket":"bucket","prefix":"prefix","can_write":true,"extra_read_headers":[],"extra_write_headers":[]}"# - ); - - let deserialized: S3Storage = serde_json::from_str(&serialized).unwrap(); - assert_eq!(storage.config, deserialized.config); - } - - #[tokio_test] - async fn test_s3_paths() { - let storage = S3Storage::new( - S3Options { - region: Some("us-west-2".to_string()), - endpoint_url: None, - allow_http: true, - anonymous: false, - force_path_style: false, - }, - "bucket".to_string(), - Some("prefix".to_string()), - S3Credentials::FromEnv, - true, - Vec::new(), - Vec::new(), - ) - .unwrap(); - - let ref_path = storage.ref_key("ref_key").unwrap(); - assert_eq!(ref_path, "prefix/refs/ref_key"); - - let snapshot_id = SnapshotId::random(); - let snapshot_path = storage.get_snapshot_path(&snapshot_id).unwrap(); - assert_eq!(snapshot_path, format!("prefix/snapshots/{snapshot_id}")); - - let manifest_id = ManifestId::random(); - let manifest_path = storage.get_manifest_path(&manifest_id).unwrap(); - assert_eq!(manifest_path, format!("prefix/manifests/{manifest_id}")); - - let chunk_id = ChunkId::random(); - let chunk_path = storage.get_chunk_path(&chunk_id).unwrap(); - assert_eq!(chunk_path, format!("prefix/chunks/{chunk_id}")); - - let transaction_id = SnapshotId::random(); - let transaction_path = storage.get_transaction_path(&transaction_id).unwrap(); - assert_eq!(transaction_path, format!("prefix/transactions/{transaction_id}")); - } -} diff --git 
a/icechunk/src/virtual_chunks.rs b/icechunk/src/virtual_chunks.rs index e6b716ce2..b89eb30e7 100644 --- a/icechunk/src/virtual_chunks.rs +++ b/icechunk/src/virtual_chunks.rs @@ -40,7 +40,7 @@ use crate::{ use crate::format::manifest::SecondsSinceEpoch; #[cfg(not(target_arch = "wasm32"))] -use crate::storage::implementations::{ +use crate::storage::backends::{ object_store::{ GcsObjectStoreBackend, HttpObjectStoreBackend, ObjectStoreBackend as _, }, diff --git a/icechunk/tests/test_storage.rs b/icechunk/tests/test_storage.rs index beb027434..edd81aee1 100644 --- a/icechunk/tests/test_storage.rs +++ b/icechunk/tests/test_storage.rs @@ -17,7 +17,7 @@ use icechunk::{ }, storage::{ self, ETag, FetchConfigResult, Generation, StorageResult, UpdateConfigResult, - VersionInfo, new_in_memory_storage, new_s3_storage, s3::mk_client, + VersionInfo, backends::s3::mk_client, new_in_memory_storage, new_s3_storage, }, }; use icechunk_macros::tokio_test; diff --git a/icechunk/tests/test_virtual_refs.rs b/icechunk/tests/test_virtual_refs.rs index a267a2801..70c316086 100644 --- a/icechunk/tests/test_virtual_refs.rs +++ b/icechunk/tests/test_virtual_refs.rs @@ -21,7 +21,8 @@ use icechunk::{ repository::VersionInfo, session::{SessionErrorKind, get_chunk}, storage::{ - self, ConcurrencySettings, ETag, ObjectStorage, new_s3_storage, s3::mk_client, + self, ConcurrencySettings, ETag, ObjectStorage, backends::s3::mk_client, + new_s3_storage, }, store::{StoreError, StoreErrorKind}, virtual_chunks::VirtualChunkContainer,