Skip to content

Commit f75d9e7

Browse files
authored
Initial HTTP support for Virtual Refs (#938)
1 parent 5f81e32 commit f75d9e7

File tree

9 files changed

+225
-32
lines changed

9 files changed

+225
-32
lines changed

icechunk-python/python/icechunk/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,8 @@
7474
AnyObjectStoreConfig,
7575
azure_storage,
7676
gcs_storage,
77+
gcs_store,
78+
http_store,
7779
in_memory_storage,
7880
local_filesystem_storage,
7981
r2_storage,
@@ -144,6 +146,8 @@
144146
"gcs_refreshable_credentials",
145147
"gcs_static_credentials",
146148
"gcs_storage",
149+
"gcs_store",
150+
"http_store",
147151
"in_memory_storage",
148152
"initialize_logs",
149153
"local_filesystem_storage",

icechunk-python/python/icechunk/_icechunk_python.pyi

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,13 +45,16 @@ class ObjectStoreConfig:
4545
def __init__(self, options: S3Options) -> None: ...
4646

4747
class Gcs:
48-
def __init__(self) -> None: ...
48+
def __init__(self, opts: dict[str, str] | None = None) -> None: ...
4949

5050
class Azure:
51-
def __init__(self) -> None: ...
51+
def __init__(self, opts: dict[str, str] | None = None) -> None: ...
5252

5353
class Tigris:
54-
def __init__(self) -> None: ...
54+
def __init__(self, opts: S3Options) -> None: ...
55+
56+
class Http:
57+
def __init__(self, opts: dict[str, str] | None = None) -> None: ...
5558

5659
AnyObjectStoreConfig = (
5760
ObjectStoreConfig.InMemory
@@ -61,6 +64,7 @@ AnyObjectStoreConfig = (
6164
| ObjectStoreConfig.Gcs
6265
| ObjectStoreConfig.Azure
6366
| ObjectStoreConfig.Tigris
67+
| ObjectStoreConfig.Http
6468
)
6569

6670
class VirtualChunkContainer:

icechunk-python/python/icechunk/storage.py

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,19 @@ def local_filesystem_storage(path: str) -> Storage:
4040
return Storage.new_local_filesystem(path)
4141

4242

43+
def http_store(
44+
opts: dict[str, str] | None = None,
45+
) -> ObjectStoreConfig.Http:
46+
"""Build an ObjectStoreConfig instance for HTTP object stores.
47+
48+
Parameters
49+
----------
50+
opts: dict[str, str] | None
51+
A dictionary of options for the HTTP object store. See https://docs.rs/object_store/latest/object_store/client/enum.ClientConfigKey.html#variants for a list of possible keys in snake case format.
52+
"""
53+
return ObjectStoreConfig.Http(opts)
54+
55+
4356
def s3_store(
4457
region: str | None = None,
4558
endpoint_url: str | None = None,
@@ -332,6 +345,19 @@ def r2_storage(
332345
)
333346

334347

348+
def gcs_store(
349+
opts: dict[str, str] | None = None,
350+
) -> ObjectStoreConfig.Gcs:
351+
"""Build an ObjectStoreConfig instance for Google Cloud Storage object stores.
352+
353+
Parameters
354+
----------
355+
opts: dict[str, str] | None
356+
A dictionary of options for the Google Cloud Storage object store. See https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html#variants for a list of possible configuration keys.
357+
"""
358+
return ObjectStoreConfig.Gcs(opts)
359+
360+
335361
def gcs_storage(
336362
*,
337363
bucket: str,
@@ -353,10 +379,18 @@ def gcs_storage(
353379
The bucket where the repository will store its data
354380
prefix: str | None
355381
The prefix within the bucket that is the root directory of the repository
356-
from_env: bool | None
357-
Fetch credentials from the operative system environment
382+
service_account_file: str | None
383+
The path to the service account file
384+
service_account_key: str | None
385+
The service account key
386+
application_credentials: str | None
387+
The path to the application credentials file
358388
bearer_token: str | None
359389
The bearer token to use for the object store
390+
from_env: bool | None
391+
Fetch credentials from the operative system environment
392+
config: dict[str, str] | None
393+
A dictionary of options for the Google Cloud Storage object store. See https://docs.rs/object_store/latest/object_store/gcp/enum.GoogleConfigKey.html#variants for a list of possible configuration keys.
360394
get_credentials: Callable[[], GcsBearerCredential] | None
361395
Use this function to get and refresh object store credentials
362396
scatter_initial_credentials: bool, optional
@@ -412,6 +446,8 @@ def azure_storage(
412446
Azure Blob Storage credential bearer token
413447
from_env: bool | None
414448
Fetch credentials from the operative system environment
449+
config: dict[str, str] | None
450+
A dictionary of options for the Azure Blob Storage object store. See https://docs.rs/object_store/latest/object_store/azure/enum.AzureConfigKey.html#variants for a list of possible configuration keys.
415451
"""
416452
credentials = azure_credentials(
417453
access_key=access_key,

icechunk-python/src/config.rs

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -459,8 +459,9 @@ pub enum PyObjectStoreConfig {
459459
S3Compatible(PyS3Options),
460460
S3(PyS3Options),
461461
Gcs(Option<HashMap<String, String>>),
462-
Azure(HashMap<String, String>),
462+
Azure(Option<HashMap<String, String>>),
463463
Tigris(PyS3Options),
464+
Http(Option<HashMap<String, String>>),
464465
}
465466

466467
impl From<&PyObjectStoreConfig> for ObjectStoreConfig {
@@ -477,8 +478,13 @@ impl From<&PyObjectStoreConfig> for ObjectStoreConfig {
477478
PyObjectStoreConfig::Gcs(opts) => {
478479
ObjectStoreConfig::Gcs(opts.clone().unwrap_or_default())
479480
}
480-
PyObjectStoreConfig::Azure(opts) => ObjectStoreConfig::Azure(opts.clone()),
481+
PyObjectStoreConfig::Azure(opts) => {
482+
ObjectStoreConfig::Azure(opts.clone().unwrap_or_default())
483+
}
481484
PyObjectStoreConfig::Tigris(opts) => ObjectStoreConfig::Tigris(opts.into()),
485+
PyObjectStoreConfig::Http(opts) => {
486+
ObjectStoreConfig::Http(opts.clone().unwrap_or_default())
487+
}
482488
}
483489
}
484490
}
@@ -495,8 +501,9 @@ impl From<ObjectStoreConfig> for PyObjectStoreConfig {
495501
}
496502
ObjectStoreConfig::S3(opts) => PyObjectStoreConfig::S3(opts.into()),
497503
ObjectStoreConfig::Gcs(opts) => PyObjectStoreConfig::Gcs(Some(opts)),
498-
ObjectStoreConfig::Azure(opts) => PyObjectStoreConfig::Azure(opts),
504+
ObjectStoreConfig::Azure(opts) => PyObjectStoreConfig::Azure(Some(opts)),
499505
ObjectStoreConfig::Tigris(opts) => PyObjectStoreConfig::Tigris(opts.into()),
506+
ObjectStoreConfig::Http(opts) => PyObjectStoreConfig::Http(Some(opts)),
500507
}
501508
}
502509
}

icechunk-python/tests/test_virtual_ref.py

Lines changed: 26 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
VirtualChunkContainer,
1717
VirtualChunkSpec,
1818
containers_credentials,
19+
http_store,
1920
in_memory_storage,
2021
local_filesystem_storage,
2122
s3_credentials,
@@ -213,21 +214,33 @@ async def test_write_minio_virtual_refs() -> None:
213214
_snapshot_id = session.commit("Add virtual refs")
214215

215216

216-
async def test_from_s3_public_virtual_refs(tmpdir: Path) -> None:
217+
@pytest.mark.parametrize(
218+
"container_type,url_prefix,store_config",
219+
[
220+
(
221+
"s3",
222+
"s3://earthmover-sample-data",
223+
ObjectStoreConfig.S3(S3Options(region="us-east-1", anonymous=True)),
224+
),
225+
(
226+
"http",
227+
"https://earthmover-sample-data.s3.amazonaws.com",
228+
http_store(),
229+
),
230+
],
231+
)
232+
async def test_public_virtual_refs(
233+
tmpdir: Path,
234+
container_type: str,
235+
url_prefix: str,
236+
store_config: ObjectStoreConfig.S3 | ObjectStoreConfig.Http,
237+
) -> None:
217238
config = RepositoryConfig.default()
218-
store_config = ObjectStoreConfig.S3(
219-
S3Options(
220-
region="us-east-1",
221-
anonymous=True,
222-
)
223-
)
224-
container = VirtualChunkContainer(
225-
"sample-data", "s3://earthmover-sample-data", store_config
226-
)
239+
container = VirtualChunkContainer("sample-data", url_prefix, store_config)
227240
config.set_virtual_chunk_container(container)
228241

229242
repo = Repository.open_or_create(
230-
storage=local_filesystem_storage(f"{tmpdir}/virtual"),
243+
storage=local_filesystem_storage(f"{tmpdir}/virtual-{container_type}"),
231244
config=config,
232245
)
233246
session = repo.writable_session("main")
@@ -238,9 +251,10 @@ async def test_from_s3_public_virtual_refs(tmpdir: Path) -> None:
238251
name="year", shape=((72,)), chunks=((72,)), dtype="float32", compressors=None
239252
)
240253

254+
file_path = f"{url_prefix}/netcdf/oscar_vel2018.nc"
241255
store.set_virtual_ref(
242256
"year/c/0",
243-
"s3://earthmover-sample-data/netcdf/oscar_vel2018.nc",
257+
file_path,
244258
offset=22306,
245259
length=288,
246260
)

icechunk/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ bytes = { version = "1.10.1", features = ["serde"] }
1818
base64 = "0.22.1"
1919
futures = "0.3.31"
2020
itertools = "0.14.0"
21-
object_store = { version = "0.12.2", features = ["aws", "gcp", "azure"] }
21+
object_store = { version = "0.12.2", features = ["aws", "gcp", "azure", "http"] }
2222
rand = "0.9.1"
2323
thiserror = "2.0.12"
2424
serde_json = "1.0.140"

icechunk/src/config.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ impl fmt::Display for S3Options {
5252
pub enum ObjectStoreConfig {
5353
InMemory,
5454
LocalFileSystem(PathBuf),
55+
Http(HashMap<String, String>),
5556
S3Compatible(S3Options),
5657
S3(S3Options),
5758
Gcs(HashMap<String, String>),

icechunk/src/storage/object_store.rs

Lines changed: 83 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,13 @@ use futures::{
1414
stream::{self, BoxStream},
1515
};
1616
use object_store::{
17-
Attribute, AttributeValue, Attributes, BackoffConfig, CredentialProvider, GetOptions,
18-
ObjectMeta, ObjectStore, PutMode, PutOptions, PutPayload, RetryConfig,
19-
StaticCredentialProvider, UpdateVersion,
17+
Attribute, AttributeValue, Attributes, BackoffConfig, ClientConfigKey,
18+
CredentialProvider, GetOptions, ObjectMeta, ObjectStore, PutMode, PutOptions,
19+
PutPayload, RetryConfig, StaticCredentialProvider, UpdateVersion,
2020
aws::AmazonS3Builder,
2121
azure::{AzureConfigKey, MicrosoftAzureBuilder},
2222
gcp::{GcpCredential, GoogleCloudStorageBuilder, GoogleConfigKey},
23+
http::HttpBuilder,
2324
local::LocalFileSystem,
2425
memory::InMemory,
2526
path::Path as ObjectPath,
@@ -278,7 +279,7 @@ impl private::Sealed for ObjectStorage {}
278279
#[typetag::serde]
279280
impl Storage for ObjectStorage {
280281
fn can_write(&self) -> bool {
281-
true
282+
self.backend.can_write()
282283
}
283284

284285
#[instrument(skip_all)]
@@ -653,6 +654,10 @@ pub trait ObjectStoreBackend: Debug + Display + Sync + Send {
653654
}
654655

655656
fn default_settings(&self) -> Settings;
657+
658+
fn can_write(&self) -> bool {
659+
true
660+
}
656661
}
657662

658663
#[derive(Debug, Serialize, Deserialize)]
@@ -756,6 +761,80 @@ impl ObjectStoreBackend for LocalFileSystemObjectStoreBackend {
756761
}
757762
}
758763

764+
#[derive(Debug, Serialize, Deserialize)]
765+
pub struct HttpObjectStoreBackend {
766+
pub url: String,
767+
pub config: Option<HashMap<ClientConfigKey, String>>,
768+
}
769+
770+
impl fmt::Display for HttpObjectStoreBackend {
771+
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
772+
write!(
773+
f,
774+
"HttpObjectStoreBackend(url={}, config={})",
775+
self.url,
776+
self.config
777+
.as_ref()
778+
.map(|c| c
779+
.iter()
780+
.map(|(k, v)| format!("{:?}={}", k, v))
781+
.collect::<Vec<_>>()
782+
.join(", "))
783+
.unwrap_or("None".to_string())
784+
)
785+
}
786+
}
787+
788+
#[typetag::serde(name = "http_object_store_provider")]
789+
impl ObjectStoreBackend for HttpObjectStoreBackend {
790+
fn mk_object_store(
791+
&self,
792+
settings: &Settings,
793+
) -> Result<Arc<dyn ObjectStore>, StorageError> {
794+
let builder = HttpBuilder::new().with_url(&self.url);
795+
796+
// Add options
797+
let builder = self
798+
.config
799+
.as_ref()
800+
.unwrap_or(&HashMap::new())
801+
.iter()
802+
.fold(builder, |builder, (key, value)| builder.with_config(*key, value));
803+
804+
let builder = builder.with_retry(RetryConfig {
805+
backoff: BackoffConfig {
806+
init_backoff: core::time::Duration::from_millis(
807+
settings.retries().initial_backoff_ms() as u64,
808+
),
809+
max_backoff: core::time::Duration::from_millis(
810+
settings.retries().max_backoff_ms() as u64,
811+
),
812+
base: 2.,
813+
},
814+
max_retries: settings.retries().max_tries().get() as usize - 1,
815+
retry_timeout: core::time::Duration::from_secs(5 * 60),
816+
});
817+
818+
let store =
819+
builder.build().map_err(|e| StorageErrorKind::Other(e.to_string()))?;
820+
821+
Ok(Arc::new(store))
822+
}
823+
824+
fn prefix(&self) -> String {
825+
"".to_string()
826+
}
827+
828+
fn default_settings(&self) -> Settings {
829+
Default::default()
830+
}
831+
832+
fn can_write(&self) -> bool {
833+
// TODO: Support write operations?
834+
false
835+
}
836+
}
837+
759838
#[derive(Debug, Serialize, Deserialize)]
760839
pub struct S3ObjectStoreBackend {
761840
bucket: String,

0 commit comments

Comments
 (0)