-
Notifications
You must be signed in to change notification settings - Fork 68
Expand file tree
/
Copy pathrepository.py
More file actions
674 lines (564 loc) · 19.8 KB
/
repository.py
File metadata and controls
674 lines (564 loc) · 19.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
import datetime
from collections.abc import AsyncIterator, Iterator
from typing import Any, Self, cast
from icechunk._icechunk_python import (
Diff,
GCSummary,
PyRepository,
RepositoryConfig,
SnapshotInfo,
Storage,
)
from icechunk.credentials import AnyCredential
from icechunk.session import Session
class Repository:
"""An Icechunk repository."""
_repository: PyRepository
def __init__(self, repository: PyRepository):
self._repository = repository
@classmethod
def create(
cls,
storage: Storage,
config: RepositoryConfig | None = None,
virtual_chunk_credentials: dict[str, AnyCredential] | None = None,
) -> Self:
"""
Create a new Icechunk repository.
If one already exists at the given store location, an error will be raised.
!!! warning
Attempting to create a Repo concurrently in the same location from multiple processes is not safe.
Instead, create a Repo once and then open it concurrently.
Parameters
----------
storage : Storage
The storage configuration for the repository.
config : RepositoryConfig, optional
The repository configuration. If not provided, a default configuration will be used.
virtual_chunk_credentials : dict[str, AnyCredential], optional
Credentials for virtual chunks.
Returns
-------
Self
An instance of the Repository class.
"""
return cls(
PyRepository.create(
storage,
config=config,
virtual_chunk_credentials=virtual_chunk_credentials,
)
)
@classmethod
def open(
cls,
storage: Storage,
config: RepositoryConfig | None = None,
virtual_chunk_credentials: dict[str, AnyCredential] | None = None,
) -> Self:
"""
Open an existing Icechunk repository.
If no repository exists at the given storage location, an error will be raised.
!!! warning
This method must be used with care in a multiprocessing context.
Read more in our [Parallel Write Guide](/icechunk-python/parallel#uncooperative-distributed-writes).
Parameters
----------
storage : Storage
The storage configuration for the repository.
config : RepositoryConfig, optional
The repository settings. If not provided, a default configuration will be
loaded from the repository.
virtual_chunk_credentials : dict[str, AnyCredential], optional
Credentials for virtual chunks.
Returns
-------
Self
An instance of the Repository class.
"""
return cls(
PyRepository.open(
storage,
config=config,
virtual_chunk_credentials=virtual_chunk_credentials,
)
)
@classmethod
def open_or_create(
cls,
storage: Storage,
config: RepositoryConfig | None = None,
virtual_chunk_credentials: dict[str, AnyCredential] | None = None,
) -> Self:
"""
Open an existing Icechunk repository or create a new one if it does not exist.
!!! warning
This method must be used with care in a multiprocessing context.
Read more in our [Parallel Write Guide](/icechunk-python/parallel#uncooperative-distributed-writes).
Attempting to create a Repo concurrently in the same location from multiple processes is not safe.
Instead, create a Repo once and then open it concurrently.
Parameters
----------
storage : Storage
The storage configuration for the repository.
config : RepositoryConfig, optional
The repository settings. If not provided, a default configuration will be
loaded from the repository.
virtual_chunk_credentials : dict[str, AnyCredential], optional
Credentials for virtual chunks.
Returns
-------
Self
An instance of the Repository class.
"""
return cls(
PyRepository.open_or_create(
storage,
config=config,
virtual_chunk_credentials=virtual_chunk_credentials,
)
)
@staticmethod
def exists(storage: Storage) -> bool:
"""
Check if a repository exists at the given storage location.
Parameters
----------
storage : Storage
The storage configuration for the repository.
Returns
-------
bool
True if the repository exists, False otherwise.
"""
return PyRepository.exists(storage)
def __getstate__(self) -> object:
return {
"_repository": self._repository.as_bytes(),
}
def __setstate__(self, state: object) -> None:
if not isinstance(state, dict):
raise ValueError("Invalid repository state")
self._repository = PyRepository.from_bytes(state["_repository"])
@staticmethod
def fetch_config(storage: Storage) -> RepositoryConfig | None:
"""
Fetch the configuration for the repository saved in storage.
Parameters
----------
storage : Storage
The storage configuration for the repository.
Returns
-------
RepositoryConfig | None
The repository configuration if it exists, None otherwise.
"""
return PyRepository.fetch_config(storage)
def save_config(self) -> None:
"""
Save the repository configuration to storage, this configuration will be used in future calls to Repository.open.
Returns
-------
None
"""
return self._repository.save_config()
@property
def config(self) -> RepositoryConfig:
"""
Get a copy of this repository's config.
Returns
-------
RepositoryConfig
The repository configuration.
"""
return self._repository.config()
@property
def storage(self) -> Storage:
"""
Get a copy of this repository's Storage instance.
Returns
-------
Storage
The repository storage instance.
"""
return self._repository.storage()
def set_default_commit_metadata(self, metadata: dict[str, Any]) -> None:
"""
Set the default commit metadata for the repository. This is useful for providing
addition static system conexted metadata to all commits.
When a commit is made, the metadata will be merged with the metadata provided, with any
duplicate keys being overwritten by the metadata provided in the commit.
!!! warning
This metadata is only applied to sessions that are created after this call. Any open
writable sessions will not be affected and will not use the new default metadata.
Parameters
----------
metadata : dict[str, Any]
The default commit metadata. Pass an empty dict to clear the default metadata.
"""
return self._repository.set_default_commit_metadata(metadata)
def default_commit_metadata(self) -> dict[str, Any]:
"""
Get the current configured default commit metadata for the repository.
Returns
-------
dict[str, Any]
The default commit metadata.
"""
return self._repository.default_commit_metadata()
def ancestry(
self,
*,
branch: str | None = None,
tag: str | None = None,
snapshot_id: str | None = None,
) -> Iterator[SnapshotInfo]:
"""
Get the ancestry of a snapshot.
Parameters
----------
branch : str, optional
The branch to get the ancestry of.
tag : str, optional
The tag to get the ancestry of.
snapshot_id : str, optional
The snapshot ID to get the ancestry of.
Returns
-------
list[SnapshotInfo]
The ancestry of the snapshot, listing out the snapshots and their metadata.
Notes
-----
Only one of the arguments can be specified.
"""
# the returned object is both an Async and Sync iterator
res = cast(
Iterator[SnapshotInfo],
self._repository.async_ancestry(
branch=branch, tag=tag, snapshot_id=snapshot_id
),
)
return res
def async_ancestry(
self,
*,
branch: str | None = None,
tag: str | None = None,
snapshot_id: str | None = None,
) -> AsyncIterator[SnapshotInfo]:
"""
Get the ancestry of a snapshot.
Parameters
----------
branch : str, optional
The branch to get the ancestry of.
tag : str, optional
The tag to get the ancestry of.
snapshot_id : str, optional
The snapshot ID to get the ancestry of.
Returns
-------
list[SnapshotInfo]
The ancestry of the snapshot, listing out the snapshots and their metadata.
Notes
-----
Only one of the arguments can be specified.
"""
return self._repository.async_ancestry(
branch=branch, tag=tag, snapshot_id=snapshot_id
)
def create_branch(self, branch: str, snapshot_id: str) -> None:
"""
Create a new branch at the given snapshot.
Parameters
----------
branch : str
The name of the branch to create.
snapshot_id : str
The snapshot ID to create the branch at.
Returns
-------
None
"""
self._repository.create_branch(branch, snapshot_id)
def list_branches(self) -> set[str]:
"""
List the branches in the repository.
Returns
-------
set[str]
A set of branch names.
"""
return self._repository.list_branches()
def lookup_branch(self, branch: str) -> str:
"""
Get the tip snapshot ID of a branch.
Parameters
----------
branch : str
The branch to get the tip of.
Returns
-------
str
The snapshot ID of the tip of the branch.
"""
return self._repository.lookup_branch(branch)
def lookup_snapshot(self, snapshot_id: str) -> SnapshotInfo:
"""
Get the SnapshotInfo given a snapshot ID
Parameters
----------
snapshot_id : str
The id of the snapshot to look up
Returns
-------
SnapshotInfo
"""
return self._repository.lookup_snapshot(snapshot_id)
def reset_branch(self, branch: str, snapshot_id: str) -> None:
"""
Reset a branch to a specific snapshot.
This will permanently alter the history of the branch such that the tip of
the branch is the specified snapshot.
Parameters
----------
branch : str
The branch to reset.
snapshot_id : str
The snapshot ID to reset the branch to.
Returns
-------
None
"""
self._repository.reset_branch(branch, snapshot_id)
def delete_branch(self, branch: str) -> None:
"""
Delete a branch.
Parameters
----------
branch : str
The branch to delete.
Returns
-------
None
"""
self._repository.delete_branch(branch)
def delete_tag(self, tag: str) -> None:
"""
Delete a tag.
Parameters
----------
tag : str
The tag to delete.
Returns
-------
None
"""
self._repository.delete_tag(tag)
def create_tag(self, tag: str, snapshot_id: str) -> None:
"""
Create a new tag at the given snapshot.
Parameters
----------
tag : str
The name of the tag to create.
snapshot_id : str
The snapshot ID to create the tag at.
Returns
-------
None
"""
self._repository.create_tag(tag, snapshot_id)
def list_tags(self) -> set[str]:
"""
List the tags in the repository.
Returns
-------
set[str]
A set of tag names.
"""
return self._repository.list_tags()
def lookup_tag(self, tag: str) -> str:
"""
Get the snapshot ID of a tag.
Parameters
----------
tag : str
The tag to get the snapshot ID of.
Returns
-------
str
The snapshot ID of the tag.
"""
return self._repository.lookup_tag(tag)
def diff(
self,
*,
from_branch: str | None = None,
from_tag: str | None = None,
from_snapshot_id: str | None = None,
to_branch: str | None = None,
to_tag: str | None = None,
to_snapshot_id: str | None = None,
) -> Diff:
"""
Compute an overview of the operations executed from version `from` to version `to`.
Both versions, `from` and `to`, must be identified. Identification can be done using a branch, tag or snapshot id.
The styles used to identify the `from` and `to` versions can be different.
The `from` version must be a member of the `ancestry` of `to`.
Returns
-------
Diff
The operations executed between the two versions
"""
return self._repository.diff(
from_branch=from_branch,
from_tag=from_tag,
from_snapshot_id=from_snapshot_id,
to_branch=to_branch,
to_tag=to_tag,
to_snapshot_id=to_snapshot_id,
)
def readonly_session(
self,
branch: str | None = None,
*,
tag: str | None = None,
snapshot_id: str | None = None,
as_of: datetime.datetime | None = None,
) -> Session:
"""
Create a read-only session.
This can be thought of as a read-only checkout of the repository at a given snapshot.
When branch or tag are provided, the session will be based on the tip of the branch or
the snapshot ID of the tag.
Parameters
----------
branch : str, optional
If provided, the branch to create the session on.
tag : str, optional
If provided, the tag to create the session on.
snapshot_id : str, optional
If provided, the snapshot ID to create the session on.
as_of: datetime.datetime, optional
When combined with the branch argument, it will open the session at the last
snapshot that is at or before this datetime
Returns
-------
Session
The read-only session, pointing to the specified snapshot, tag, or branch.
Notes
-----
Only one of the arguments can be specified.
"""
return Session(
self._repository.readonly_session(
branch=branch, tag=tag, snapshot_id=snapshot_id, as_of=as_of
)
)
def writable_session(self, branch: str) -> Session:
"""
Create a writable session on a branch.
Like the read-only session, this can be thought of as a checkout of the repository at the
tip of the branch. However, this session is writable and can be used to make changes to the
repository. When ready, the changes can be committed to the branch, after which the session will
become a read-only session on the new snapshot.
Parameters
----------
branch : str
The branch to create the session on.
Returns
-------
Session
The writable session on the branch.
"""
return Session(self._repository.writable_session(branch))
def expire_snapshots(
self,
older_than: datetime.datetime,
*,
delete_expired_branches: bool = False,
delete_expired_tags: bool = False,
) -> set[str]:
"""Expire all snapshots older than a threshold.
This processes snapshots found by navigating all references in
the repo, tags first, branches leter, both in lexicographical order.
Returns the ids of all snapshots considered expired and skipped
from history. Notice that this snapshot are not necessarily
available for garbage collection, they could still be pointed by
ether refs.
If `delete_expired_*` is set to True, branches or tags that, after the
expiration process, point to expired snapshots directly, will be
deleted.
Danger
------
This is an administrative operation, it should be run
carefully. The repository can still operate concurrently while
`expire_snapshots` runs, but other readers can get inconsistent
views of the repository history.
Parameters
----------
older_than: datetime.datetime
Expire snapshots older than this time.
delete_expired_branches: bool, optional
Whether to delete any branches that now have only expired snapshots.
delete_expired_tags: bool, optional
Whether to delete any tags associated with expired snapshots
Returns
-------
set of expires snapshot IDs
"""
return self._repository.expire_snapshots(
older_than,
delete_expired_branches=delete_expired_branches,
delete_expired_tags=delete_expired_tags,
)
def garbage_collect(self, delete_object_older_than: datetime.datetime) -> GCSummary:
"""Delete any objects no longer accessible from any branches or tags.
Danger
------
This is an administrative operation, it should be run
carefully. The repository can still operate concurrently while
`garbage_collect` runs, but other reades can get inconsistent
views if they are trying to access the expired snapshots.
Parameters
----------
delete_object_older_than: datetime.datetime
Delete objects older than this time.
Returns
-------
GCSummary
Summary of objects deleted.
"""
return self._repository.garbage_collect(delete_object_older_than)
def rewrite_manifests(
self, message: str, *, branch: str, metadata: dict[str, Any] | None = None
) -> str:
"""
Rewrite manifests for all arrays and commit to the specified branch.
Parameters
----------
message : str
The message to write with the commit.
branch: str
The branch to commit to.
metadata : dict[str, Any] | None, optional
Additional metadata to store with the commit snapshot.
Returns
-------
str
The snapshot ID of the new commit.
"""
return self._repository.rewrite_manifests(
message, branch=branch, metadata=metadata
)
def total_chunks_storage(self) -> int:
"""Calculate the total storage used for chunks, in bytes .
It reports the storage needed to store all snapshots in the repository that
are reachable from any branches or tags. Unreachable snapshots can be generated
by using `reset_branch` or `expire_snapshots`. The chunks for these snapshots
are not included in the result, and they should probably be deleted using
`garbage_collection`.
The result includes only native chunks, not adding virtual or inline chunks.
"""
return self._repository.total_chunks_storage()