Skip to content

Commit 298521a

Browse files
committed
Add DRS compact identifier support
Support compact identifiers (e.g., drs://drs.anv0:object-id) by resolving prefixes via identifiers.org. Includes caching and tests.
1 parent a1f2432 commit 298521a

File tree

3 files changed

+420
-10
lines changed

3 files changed

+420
-10
lines changed

lib/galaxy/files/sources/util.py

Lines changed: 155 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,13 @@
1+
import json
2+
import logging
13
import time
2-
from typing import Optional
4+
from typing import (
5+
Dict,
6+
List,
7+
Optional,
8+
Tuple,
9+
)
10+
from urllib.parse import quote
311

412
from galaxy import exceptions
513
from galaxy.files import (
@@ -18,6 +26,8 @@
1826
from galaxy.util.config_parsers import IpAllowedListEntryT
1927
from galaxy.util.path import StrPath
2028

29+
log = logging.getLogger(__name__)
30+
2131

2232
def _not_implemented(drs_uri: str, desc: str) -> NotImplementedError:
2333
missing_client_func = f"Galaxy client cannot currently fetch URIs {desc}."
@@ -72,6 +82,134 @@ def _get_access_info(obj_url: str, access_method: dict, headers=None) -> tuple[s
7282
return url, headers_as_dict
7383

7484

85+
class CompactIdentifierResolver:
    """Resolve DRS compact-identifier prefixes to URL patterns via identifiers.org.

    The class is a singleton: every construction returns the same instance, so
    the in-memory prefix cache is shared by all callers.  Because ``__init__``
    only initialises state when the instance has no cache yet, a ``cache_ttl``
    passed to a later construction is ignored.
    """

    _instance: Optional["CompactIdentifierResolver"] = None

    def __new__(cls, *args, **kwargs):
        """Return the shared instance, creating it on first use."""
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self, cache_ttl: int = 86400):
        """Initialise the cache once.

        :param cache_ttl: lifetime of a cached prefix in seconds; only honoured
            by the construction that actually creates the cache.
        """
        # __init__ runs on every CompactIdentifierResolver() call, so guard
        # against wiping the shared cache.
        if not hasattr(self, "_cache"):
            self._cache: Dict[str, Dict] = {}
            self._cache_ttl = cache_ttl

    @classmethod
    def _reset_singleton(cls):
        """Reset the singleton instance - for testing only."""
        cls._instance = None

    def _is_cached(self, prefix: str) -> bool:
        """Return True when ``prefix`` has a cache entry younger than the TTL."""
        if prefix not in self._cache:
            return False
        cached_time = self._cache[prefix].get("timestamp", 0)
        return (time.time() - cached_time) < self._cache_ttl

    def _cache_result(self, prefix: str, url_pattern: str):
        """Store ``url_pattern`` for ``prefix`` together with the current time."""
        self._cache[prefix] = {"url_pattern": url_pattern, "timestamp": time.time()}

    @staticmethod
    def _select_url_pattern(resources: list) -> Optional[str]:
        """Pick the best ``urlPattern`` from a list of registry resources.

        The first resource flagged ``official`` wins; otherwise the first
        resource that carries a usable pattern is returned.  Entries that are
        not dicts or whose pattern is missing/empty are skipped so they cannot
        mask a later usable resource.  Returns None when nothing qualifies.
        """
        fallback_pattern = None
        for resource in resources:
            if not isinstance(resource, dict):
                continue
            pattern = resource.get("urlPattern")
            if not pattern:
                continue
            if resource.get("official", False):
                return pattern
            if fallback_pattern is None:
                fallback_pattern = pattern
        return fallback_pattern

    def _query_identifiers_org(self, prefix: str) -> Optional[str]:
        """Look up the URL pattern registered for ``prefix`` at identifiers.org.

        Two requests are made: one to find the namespace for the prefix and one
        to list the namespace's resources.  Network errors and malformed
        responses are logged and reported as None rather than raised.

        :returns: the selected URL pattern, or None when the prefix is unknown
            or the registry could not be queried.
        """
        try:
            # Percent-encode the prefix so characters such as '&' or '#' cannot
            # alter the query string; valid prefixes are left untouched.
            namespace_url = (
                "https://registry.api.identifiers.org/restApi/namespaces/search/findByPrefix"
                f"?prefix={quote(prefix, safe='')}"
            )
            response = requests.get(namespace_url, timeout=DEFAULT_SOCKET_TIMEOUT)
            response.raise_for_status()

            namespace_data = response.json()
            if not namespace_data or "_links" not in namespace_data:
                return None

            links = namespace_data["_links"]
            if "resources" not in links:
                return None
            resources_url = links["resources"]["href"]

            response = requests.get(resources_url, timeout=DEFAULT_SOCKET_TIMEOUT)
            response.raise_for_status()

            resources = response.json()
            if "_embedded" in resources and "resources" in resources["_embedded"]:
                return self._select_url_pattern(resources["_embedded"]["resources"])

        except requests.exceptions.RequestException as e:
            log.warning(f"Failed to query identifiers.org for prefix {prefix}: {e}")
        except (KeyError, TypeError, ValueError) as e:
            # ValueError covers json.JSONDecodeError; TypeError covers payloads
            # whose JSON shape is not the expected nesting of objects.
            log.warning(f"Invalid response from identifiers.org for prefix {prefix}: {e}")

        return None

    def resolve_prefix(self, prefix: str) -> Optional[str]:
        """Return the URL pattern for ``prefix``, using the cache when possible.

        Successful lookups are cached; failed lookups are not, so they are
        retried on the next call.

        :returns: the URL pattern, or None when the prefix cannot be resolved.
        """
        if self._is_cached(prefix):
            return self._cache[prefix]["url_pattern"]

        url_pattern = self._query_identifiers_org(prefix)

        if url_pattern:
            self._cache_result(prefix, url_pattern)
            log.info(f"Resolved DRS prefix '{prefix}' to URL pattern: {url_pattern}")
        else:
            log.warning(f"Could not resolve DRS prefix '{prefix}' via identifiers.org")

        return url_pattern
168+
169+
170+
def parse_compact_identifier(drs_uri: str) -> Tuple[str, str]:
    """Split a compact-identifier DRS URI into ``(prefix, accession)``.

    The expected form is ``drs://<prefix>:<accession>``.  Only the first colon
    separates the two parts, so the accession itself may contain colons.

    :param drs_uri: the URI to parse.
    :returns: a ``(prefix, accession)`` tuple, both non-empty.
    :raises ValueError: if the URI does not start with ``drs://``, has no
        colon, has an empty prefix or accession, or the prefix contains
        anything other than ASCII lowercase letters, digits, dots and
        underscores.
    """
    scheme = "drs://"
    if not drs_uri.startswith(scheme):
        raise ValueError(f"Not a valid DRS URI: {drs_uri}")

    prefix, separator, accession = drs_uri[len(scheme) :].partition(":")
    if not separator:
        raise ValueError(f"Invalid compact identifier format (missing colon): {drs_uri}")

    # Compare against an explicit ASCII set: str.islower()/str.isdigit() also
    # accept non-ASCII characters such as 'é' or '²'.
    allowed_chars = "abcdefghijklmnopqrstuvwxyz0123456789._"
    if any(c not in allowed_chars for c in prefix):
        raise ValueError(
            f"Invalid prefix format '{prefix}': must contain only lowercase letters, numbers, dots, and underscores"
        )

    if not prefix or not accession:
        raise ValueError(f"Empty prefix or accession in compact identifier: {drs_uri}")

    return prefix, accession
192+
193+
194+
def resolve_compact_identifier_to_url(drs_uri: str, resolver: Optional[CompactIdentifierResolver] = None) -> str:
    """Turn a compact-identifier DRS URI into the HTTP(S) URL of the DRS object.

    The prefix is resolved to a URL pattern through ``resolver`` and the
    percent-encoded accession is substituted for the ``{$id}`` placeholder.

    :param drs_uri: URI of the form ``drs://<prefix>:<accession>``.
    :param resolver: resolver to use; a ``CompactIdentifierResolver`` is
        constructed when omitted.
    :returns: the resolved URL.
    :raises ValueError: if the URI cannot be parsed, the prefix cannot be
        resolved, the pattern has no ``{$id}`` placeholder, or the result is
        not an ``http://`` / ``https://`` URL.
    """
    prefix, accession = parse_compact_identifier(drs_uri)

    if resolver is None:
        resolver = CompactIdentifierResolver()

    url_pattern = resolver.resolve_prefix(prefix)
    if not url_pattern:
        raise ValueError(f"Could not resolve prefix '{prefix}' via identifiers.org")

    # Without the placeholder, replace() would be a no-op and the returned URL
    # would silently ignore the accession.
    placeholder = "{$id}"
    if placeholder not in url_pattern:
        raise ValueError(f"URL pattern for prefix '{prefix}' has no {placeholder} placeholder: {url_pattern}")

    # safe="" encodes every reserved character so the accession cannot change
    # the structure of the resolved URL.
    encoded_accession = quote(accession, safe="")
    resolved_url = url_pattern.replace(placeholder, encoded_accession)

    if not resolved_url.startswith(("http://", "https://")):
        raise ValueError(f"Resolved URL is not HTTP(S): {resolved_url}")

    return resolved_url
211+
212+
75213
def fetch_drs_to_file(
76214
drs_uri: str,
77215
target_path: StrPath,
@@ -84,16 +222,23 @@ def fetch_drs_to_file(
84222
"""Fetch contents of drs:// URI to a target path."""
85223
if not drs_uri.startswith("drs://"):
86224
raise ValueError(f"Unknown scheme for drs_uri {drs_uri}")
225+
87226
rest_of_drs_uri = drs_uri[len("drs://") :]
88-
if "/" not in rest_of_drs_uri:
89-
# DRS URI uses compact identifiers, not yet implemented.
90-
# https://ga4gh.github.io/data-repository-service-schemas/preview/release/drs-1.2.0/docs/more-background-on-compact-identifiers.html
91-
raise _not_implemented(drs_uri, "that use compact identifiers")
92-
netspec, object_id = rest_of_drs_uri.split("/", 1)
93-
scheme = "https"
94-
if force_http:
95-
scheme = "http"
96-
get_url = f"{scheme}://{netspec}/ga4gh/drs/v1/objects/{object_id}"
227+
228+
if "/" not in rest_of_drs_uri and ":" in rest_of_drs_uri:
229+
try:
230+
get_url = resolve_compact_identifier_to_url(drs_uri)
231+
log.info(f"Resolved compact identifier DRS URI {drs_uri} to {get_url}")
232+
except ValueError as e:
233+
raise ValueError(f"Failed to resolve compact identifier DRS URI {drs_uri}: {str(e)}")
234+
elif "/" in rest_of_drs_uri:
235+
netspec, object_id = rest_of_drs_uri.split("/", 1)
236+
scheme = "https"
237+
if force_http:
238+
scheme = "http"
239+
get_url = f"{scheme}://{netspec}/ga4gh/drs/v1/objects/{object_id}"
240+
else:
241+
raise ValueError(f"Invalid DRS URI format: {drs_uri}")
97242
response = retry_and_get(get_url, retry_options or RetryOptions(), headers=headers)
98243
response.raise_for_status()
99244
response_object = response.json()
Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
import os
2+
import tempfile
3+
from unittest.mock import (
4+
Mock,
5+
patch,
6+
)
7+
8+
import pytest
9+
10+
from galaxy.files.sources.util import fetch_drs_to_file
11+
12+
13+
class TestDRSCompactIdentifiersIntegration:
    """Integration-style tests for compact-identifier handling in ``fetch_drs_to_file``."""

    @patch("galaxy.files.sources.util.requests.get")
    @patch("galaxy.files.sources.util.stream_url_to_file")
    def test_fetch_compact_identifier_drs_file(self, mock_stream, mock_get):
        """A compact identifier is resolved via the registry and the object is streamed."""

        def ok_response(payload):
            # Mocked HTTP response whose raise_for_status() is a no-op.
            reply = Mock()
            reply.json.return_value = payload
            reply.raise_for_status.return_value = None
            return reply

        namespace_reply = ok_response(
            {
                "_links": {
                    "self": {"href": "https://registry.api.identifiers.org/restApi/namespaces/123"},
                    "resources": {"href": "https://registry.api.identifiers.org/restApi/namespaces/123/resources"},
                }
            }
        )
        resources_reply = ok_response(
            {
                "_embedded": {
                    "resources": [
                        {"urlPattern": "https://example-drs.com/ga4gh/drs/v1/objects/{$id}", "official": True}
                    ]
                }
            }
        )
        drs_reply = ok_response(
            {
                "id": "test-object-id",
                "access_methods": [
                    {
                        "type": "https",
                        "access_url": {
                            "url": "https://download.example.com/test-file.txt",
                            "headers": ["Authorization: Bearer test-token"],
                        },
                    }
                ],
            }
        )
        drs_reply.status_code = 200

        # Replies are consumed in order: namespace lookup, resource listing, DRS object.
        mock_get.side_effect = [namespace_reply, resources_reply, drs_reply]

        with tempfile.NamedTemporaryFile(delete=False) as scratch:
            try:
                fetch_drs_to_file("drs://drs.test:test-object-id", scratch.name, None)

                assert mock_get.call_count == 3
                mock_get.assert_any_call(
                    "https://registry.api.identifiers.org/restApi/namespaces/search/findByPrefix?prefix=drs.test",
                    timeout=600,
                )
                mock_get.assert_any_call(
                    "https://registry.api.identifiers.org/restApi/namespaces/123/resources",
                    timeout=600,
                )
                mock_get.assert_any_call(
                    "https://example-drs.com/ga4gh/drs/v1/objects/test-object-id", timeout=600, headers=None
                )

                mock_stream.assert_called_once()
                stream_call = mock_stream.call_args
                assert stream_call.args[0] == "https://download.example.com/test-file.txt"
                assert stream_call.kwargs["target_path"] == scratch.name
            finally:
                if os.path.exists(scratch.name):
                    os.unlink(scratch.name)

    def test_compact_identifier_error_handling(self):
        """Each malformed URI is rejected with ``ValueError``."""
        malformed = (
            "drs://",  # No identifier
            "drs://no-colon",  # Missing colon
            "drs://:only-accession",  # Missing prefix
            "drs://prefix:",  # Missing accession
            "drs://UPPERCASE:accession",  # Invalid prefix format
            "https://not-drs:accession",  # Wrong scheme
        )

        for candidate in malformed:
            with pytest.raises(ValueError):
                fetch_drs_to_file(candidate, "/tmp/output", None)

    @patch("galaxy.files.sources.util.CompactIdentifierResolver._query_identifiers_org")
    def test_meta_resolver_failure_handling(self, mock_identifiers):
        """An unresolved prefix surfaces as a ``ValueError`` naming the DRS URI failure."""
        mock_identifiers.return_value = None

        with pytest.raises(ValueError, match="Failed to resolve compact identifier DRS URI"):
            fetch_drs_to_file("drs://unknown.prefix:accession", "/tmp/output", None)

0 commit comments

Comments
 (0)