|
| 1 | +import hashlib |
| 2 | +import os |
| 3 | +import tempfile |
| 4 | +from pathlib import Path |
| 5 | +from typing import Optional, Union |
| 6 | + |
| 7 | +from policyengine_core.tools.google_cloud import parse_gs_url |
| 8 | + |
| 9 | + |
def materialize_gcs_dataset_url(
    dataset_url: str,
    *,
    cache_dir: Optional[Union[str, os.PathLike]] = None,
) -> str:
    """Fetch the dataset behind a ``gs://`` URL and return its local path.

    The URL is split into bucket, object path and optional revision, the
    matching blob is resolved, and the file is downloaded only when no file
    already exists at the cache location computed for that blob's generation.

    Args:
        dataset_url: Dataset URL understood by ``parse_gs_url``.
        cache_dir: Directory under which downloads are cached. ``None``
            defers to the default chosen by ``_cached_dataset_path``.

    Returns:
        The local filesystem path of the dataset, as a string.
    """
    bucket_name, file_path, revision = parse_gs_url(dataset_url)
    blob = _resolve_gcs_blob(
        _get_storage_client(),
        bucket_name,
        file_path,
        revision,
    )

    destination = _cached_dataset_path(
        bucket_name=bucket_name,
        file_path=file_path,
        generation=_blob_generation(blob),
        cache_dir=cache_dir,
    )
    # The cache path is keyed on the generation, so an existing file is
    # treated as the already-downloaded copy of this exact object version.
    if destination.exists():
        return str(destination)

    _download_blob(blob, destination)
    return str(destination)
| 30 | + |
| 31 | + |
def _get_storage_client():
    """Build a ``google.cloud.storage.Client`` from default credentials.

    The Google libraries are imported lazily inside the function so that the
    module itself can be imported without them.

    Returns:
        A storage client created with the credentials and project returned by
        ``google.auth.default()``.

    Raises:
        ImportError: If the Google auth/storage packages cannot be imported.
        RuntimeError: If no default Google Cloud credentials are found.
    """
    try:
        import google.auth
        from google.auth import exceptions as auth_exceptions
        from google.cloud import storage
    except ImportError as import_error:
        install_hint = (
            "google-cloud-storage is required for gs:// dataset URLs. "
            "Install it with: pip install google-cloud-storage"
        )
        raise ImportError(install_hint) from import_error

    try:
        credentials, project_id = google.auth.default()
    except auth_exceptions.DefaultCredentialsError as credentials_error:
        credentials_hint = (
            "Google Cloud credentials are required for gs:// dataset URLs. "
            "Set application default credentials or GOOGLE_APPLICATION_CREDENTIALS."
        )
        raise RuntimeError(credentials_hint) from credentials_error

    return storage.Client(credentials=credentials, project=project_id)
| 52 | + |
| 53 | + |
def _resolve_gcs_blob(
    storage_client,
    bucket_name: str,
    file_path: str,
    revision: Optional[str],
):
    """Find the blob for ``file_path`` that corresponds to ``revision``.

    Resolution order:

    1. An all-digit ``revision`` is used as an object generation number.
    2. With no ``revision``, the blob fetched without a generation is returned.
    3. Otherwise ``revision`` is compared with the ``version`` metadata entry,
       first on the blob fetched without a generation, then on every listed
       version of the object.

    Args:
        storage_client: Client exposing ``bucket()`` and ``list_blobs()``.
        bucket_name: Name of the bucket holding the object.
        file_path: Object name inside the bucket.
        revision: Generation number, metadata version label, or ``None``.

    Returns:
        The resolved blob. When several listed versions carry the requested
        metadata version, the one with the highest generation is returned.

    Raises:
        ValueError: If no listed version has the requested metadata version.
    """
    bucket = storage_client.bucket(bucket_name)

    revision_is_generation = revision is not None and revision.isdigit()
    if revision_is_generation:
        pinned = bucket.blob(file_path, generation=int(revision))
        pinned.reload()
        return pinned

    live = bucket.blob(file_path)
    live.reload()
    if revision is None:
        return live
    if _blob_metadata_version(live) == revision:
        return live

    # The prefix listing can also return other objects whose names merely
    # start with file_path, hence the exact-name check.
    candidates = [
        listed
        for listed in storage_client.list_blobs(
            bucket_name,
            prefix=file_path,
            versions=True,
        )
        if listed.name == file_path and _blob_metadata_version(listed) == revision
    ]

    if not candidates:
        raise ValueError(
            f"No GCS object version for gs://{bucket_name}/{file_path} has "
            f"metadata version {revision!r}."
        )

    def generation_number(candidate) -> int:
        return int(_blob_generation(candidate))

    return max(candidates, key=generation_number)
| 90 | + |
| 91 | + |
def _blob_metadata_version(blob) -> Optional[str]:
    """Return the ``version`` entry of a blob's metadata, if any.

    When the blob's ``metadata`` attribute is missing or ``None``, the blob
    is reloaded once before the lookup.

    Args:
        blob: Object exposing ``metadata`` and ``reload()``.

    Returns:
        The value stored under the ``"version"`` key, or ``None`` when the
        metadata is empty/absent or has no such key.
    """
    if getattr(blob, "metadata", None) is None:
        blob.reload()

    custom_metadata = getattr(blob, "metadata", None)
    if not custom_metadata:
        return None
    return custom_metadata.get("version")
| 97 | + |
| 98 | + |
def _blob_generation(blob) -> str:
    """Return a blob's generation as a string.

    If the ``generation`` attribute is missing or ``None``, the blob is
    reloaded once and the attribute is read again.

    Args:
        blob: Object exposing ``generation``, ``name`` and ``reload()``.

    Returns:
        ``str()`` of the blob's generation.

    Raises:
        ValueError: If the generation is still unavailable after the reload.
    """
    value = getattr(blob, "generation", None)
    if value is not None:
        return str(value)

    blob.reload()
    value = getattr(blob, "generation", None)
    if value is not None:
        return str(value)

    raise ValueError(f"GCS object {blob.name!r} does not expose a generation.")
| 107 | + |
| 108 | + |
def _cached_dataset_path(
    *,
    bucket_name: str,
    file_path: str,
    generation: str,
    cache_dir: Optional[Union[str, os.PathLike]],
) -> Path:
    """Compute where a specific object generation is cached locally.

    The result is ``<cache root>/<sha256 hex digest>/<file name>``, where the
    digest covers the bucket name, object path and generation joined by NUL
    characters, and the file name is the final component of ``file_path``.

    Args:
        bucket_name: Name of the bucket holding the object.
        file_path: Object name inside the bucket.
        generation: Generation identifier of the object.
        cache_dir: Cache root. ``None`` selects a
            ``policyengine-uk-datasets`` folder inside the system temporary
            directory.

    Returns:
        The cache path. Nothing is created on disk by this function.
    """
    cache_root = (
        Path(tempfile.gettempdir()) / "policyengine-uk-datasets"
        if cache_dir is None
        else Path(cache_dir)
    )

    identity = f"{bucket_name}\0{file_path}\0{generation}"
    digest = hashlib.sha256(identity.encode()).hexdigest()
    file_name = Path(file_path).name
    return cache_root / digest / file_name
| 125 | + |
| 126 | + |
def _download_blob(blob, local_path: Path) -> None:
    """Download ``blob`` to ``local_path`` via a temporary sibling file.

    The content is first written to a temporary file in the destination
    directory and then moved over ``local_path`` with ``os.replace``, so the
    final path only appears once the download call has returned.

    Args:
        blob: Object exposing ``download_to_filename(path)``.
        local_path: Final location of the downloaded file. Missing parent
            directories are created.
    """
    destination_dir = local_path.parent
    destination_dir.mkdir(parents=True, exist_ok=True)

    descriptor, staging_name = tempfile.mkstemp(
        dir=destination_dir,
        prefix=f".{local_path.name}.",
        suffix=".tmp",
    )
    # Only the reserved file name is needed; the blob API writes by path.
    os.close(descriptor)
    staging_path = Path(staging_name)

    try:
        blob.download_to_filename(str(staging_path))
        os.replace(staging_path, local_path)
    finally:
        # After a successful replace the staging file is already gone;
        # after a failure this removes the leftover.
        staging_path.unlink(missing_ok=True)
0 commit comments