
Commit d40e20f

Route S3 URLs through file source system instead of bypass
- Enable s3fs as stock file source for DRS S3 URLs
- Fix DRS tests with proper mock endpoints
1 parent d5096eb commit d40e20f

File tree: 3 files changed, +64 -31 lines changed


lib/galaxy/files/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -162,6 +162,8 @@ def _ensure_loaded(plugin_type):
         _ensure_loaded("base64")
         _ensure_loaded("drs")
         _ensure_loaded("remoteZip")
+        # Do we actually want to do this here, if we're doing drs+s3fs?
+        _ensure_loaded("s3fs")
 
         if file_sources_config.ftp_upload_dir is not None:
             _ensure_loaded("gxftp")

lib/galaxy/files/sources/util.py

Lines changed: 25 additions & 28 deletions
@@ -40,8 +40,11 @@ def _not_implemented(drs_uri: str, desc: str) -> NotImplementedError:
     if "s3" in desc.lower():
         rest_of_message = """For S3 access methods, this DRS resource uses AWS S3 storage.
 
-Most research data repositories require AWS credentials for S3 access:
-- Public datasets: May allow anonymous access via configured S3 file source
+S3 URLs are now handled through Galaxy's file source system. If you're seeing this error,
+it means no configured S3 file source can handle the S3 URLs returned by this DRS service.
+
+Most research data repositories require specific AWS credentials for S3 access:
+- Public datasets: May work with anonymous S3 file source (anon: true)
 - Controlled access: Requires specific AWS credentials/permissions
 - SPARC datasets: Use "Requester Pays" model (user pays ~$0.09/GB)
 
@@ -55,8 +58,8 @@ def _not_implemented(drs_uri: str, desc: str) -> NotImplementedError:
       secret: YOUR_AWS_SECRET_KEY
       id: s3_research_data
 
-Note: Some datasets (like SPARC) require RequestPayer='requester' parameter
-which is not currently supported by Galaxy's S3 file source.
+Galaxy includes a stock S3 file source for basic anonymous access, but it may not
+work with all S3 buckets depending on their access policies.
 """
     else:
         rest_of_message = """Currently Galaxy client only works with HTTP/HTTPS targets but extensions for
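For reference, the configuration that the revised error message points users toward lives in a file sources YAML config. The sketch below only echoes the fragments quoted in the message above (type s3fs, id, anon: true, secret); the doc field, the key field name, and the anonymous-access entry as a whole are assumptions rather than something this commit defines:

    # Hedged sketch of file sources config entries for S3 access.
    - type: s3fs
      id: s3_anon_public
      doc: Anonymous access to public S3 buckets
      anon: true

    - type: s3fs
      id: s3_research_data
      doc: Credentialed access to controlled S3 data
      key: YOUR_AWS_ACCESS_KEY
      secret: YOUR_AWS_SECRET_KEY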
@@ -121,14 +124,15 @@ def _download_s3_file(s3_url: str, target_path: StrPath, headers: Optional[dict]
         response = requests.get(s3_url, headers=headers or {}, timeout=DEFAULT_SOCKET_TIMEOUT, stream=True)
         response.raise_for_status()
 
-        with open(target_path, 'wb') as f:
+        with open(target_path, "wb") as f:
             for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                 f.write(chunk)
         return
 
     # For raw S3 URLs, try s3fs with different access patterns
     log.debug(f"Using s3fs for S3 URL: {s3_url}")
     import s3fs
+
     s3_path = s3_url[5:]  # Remove 's3://' prefix
 
     # Try different S3 access methods in order of preference
@@ -142,8 +146,8 @@ def _download_s3_file(s3_url: str, target_path: StrPath, headers: Optional[dict]
     for method_name, fs_factory in access_methods:
         try:
             fs = fs_factory()
-            with fs.open(s3_path, 'rb') as s3_file:
-                with open(target_path, 'wb') as local_file:
+            with fs.open(s3_path, "rb") as s3_file:
+                with open(target_path, "wb") as local_file:
                     while True:
                         chunk = s3_file.read(CHUNK_SIZE)
                         if not chunk:
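The access_methods list iterated in the second hunk is defined just above it and is not part of this diff. A sketch of how such a (name, factory) fallback list could be assembled with s3fs; the specific method names, their ordering, and the requester-pays entry are assumptions:

    # Sketch: access patterns tried in order by _download_s3_file.
    import s3fs

    access_methods = [
        ("anonymous", lambda: s3fs.S3FileSystem(anon=True)),
        ("default credentials", lambda: s3fs.S3FileSystem()),
        ("requester pays", lambda: s3fs.S3FileSystem(requester_pays=True)),
    ]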
@@ -385,27 +389,20 @@ def fetch_drs_to_file(
             opts.extra_props = PartialFilesSourceProperties(**extra_props)
 
         try:
-            # Handle S3 URLs directly using s3fs instead of going through file sources
-            if access_url.startswith("s3://"):
-                log.debug(f"Handling S3 URL directly: {access_url}")
-                _download_s3_file(access_url, target_path, access_headers)
-                downloaded = True
-                break
-            else:
-                file_sources = (
-                    user_context.file_sources
-                    if user_context
-                    else ConfiguredFileSources.from_dict(None, load_stock_plugins=True)
-                )
-                stream_url_to_file(
-                    access_url,
-                    target_path=str(target_path),
-                    file_sources=file_sources,
-                    user_context=user_context,
-                    file_source_opts=opts,
-                )
-                downloaded = True
-                break
+            file_sources = (
+                user_context.file_sources
+                if user_context
+                else ConfiguredFileSources.from_dict(None, load_stock_plugins=True)
+            )
+            stream_url_to_file(
+                access_url,
+                target_path=str(target_path),
+                file_sources=file_sources,
+                user_context=user_context,
+                file_source_opts=opts,
+            )
+            downloaded = True
+            break
         except exceptions.RequestParameterInvalidException as e:
             log.debug(f"Failed to fetch via {access_method['type']} access method: {e}")
             continue
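Outside the DRS access-method loop, the same two calls shown in this hunk are enough to pull any supported URL, including s3://, down to a local path. A minimal usage sketch; the module path for stream_url_to_file and the bucket/key are assumptions:

    # Sketch: fetch an S3 object to a local file via the file source system.
    from galaxy.files import ConfiguredFileSources
    from galaxy.files.uris import stream_url_to_file  # assumed import location

    file_sources = ConfiguredFileSources.from_dict(None, load_stock_plugins=True)
    stream_url_to_file(
        "s3://ga4gh-demo-data/phenopackets/Cao-2018-TGFBR2-Patient_4.json",
        target_path="/tmp/patient_4.json",
        file_sources=file_sources,
    )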

test/unit/files/test_drs.py

Lines changed: 37 additions & 3 deletions
@@ -38,13 +38,25 @@ def drs_repo_handler(request):
         }
         return (200, {}, json.dumps(data))
 
+    def access_handler(request):
+        assert request.headers["Authorization"] == "Bearer IBearTokens"
+        access_data = {"url": "https://my.respository.org/myfile.txt", "headers": ["Authorization: Basic Z2E0Z2g6ZHJz"]}
+        return (200, {}, json.dumps(access_data))
+
     responses.add_callback(
         responses.GET,
         "https://drs.example.org/ga4gh/drs/v1/objects/314159",
         callback=drs_repo_handler,
         content_type="application/json",
     )
 
+    responses.add_callback(
+        responses.GET,
+        "https://drs.example.org/ga4gh/drs/v1/objects/314159/access/1234",
+        callback=access_handler,
+        content_type="application/json",
+    )
+
     test_url = "drs://drs.example.org/314159"
 
     def check_specific_header(request, **kwargs):
@@ -86,13 +98,25 @@ def drs_repo_handler(request):
         }
         return (200, {}, json.dumps(data))
 
+    def access_handler(request):
+        assert request.headers["Authorization"] == "Bearer IBearTokens"
+        access_data = {"url": "s3://ga4gh-demo-data/phenopackets/Cao-2018-TGFBR2-Patient_4.json", "headers": []}
+        return (200, {}, json.dumps(access_data))
+
     responses.add_callback(
         responses.GET,
         "https://drs.example.org/ga4gh/drs/v1/objects/314160",
         callback=drs_repo_handler,
         content_type="application/json",
     )
 
+    responses.add_callback(
+        responses.GET,
+        "https://drs.example.org/ga4gh/drs/v1/objects/314160/access/1234",
+        callback=access_handler,
+        content_type="application/json",
+    )
+
     test_url = "drs://drs.example.org/314160"
     file_sources = configured_file_sources(FILE_SOURCES_CONF)
     user_context = user_context_fixture(file_sources=file_sources)
@@ -101,6 +125,16 @@ def drs_repo_handler(request):
     assert file_source_pair.path == test_url
     assert file_source_pair.file_source.id == "test1"
 
-    assert_realizes_contains(
-        file_sources, test_url, "PMID:30101859-Cao-2018-TGFBR2-Patient_4", user_context=user_context
-    )
+    # Mock the S3 file source realize_to method to return test content
+    def mock_s3_realize_to(source_path, native_path, user_context=None, opts=None):
+        with open(native_path, "w") as f:
+            f.write("PMID:30101859-Cao-2018-TGFBR2-Patient_4 test data")
+
+    # Find the S3 file source and patch it
+    for fs in file_sources._file_sources:
+        if fs.plugin_type == "s3fs":
+            with mock.patch.object(fs, "realize_to", side_effect=mock_s3_realize_to):
+                assert_realizes_contains(
+                    file_sources, test_url, "PMID:30101859-Cao-2018-TGFBR2-Patient_4", user_context=user_context
+                )
+            break
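The added /access/1234 mocks emulate the second step of DRS resolution: the object record lists access methods carrying an access_id, and a follow-up request to /objects/{id}/access/{access_id} returns the concrete URL (HTTPS in the first test, s3:// in the second) plus any per-request headers. A rough sketch of that two-step flow with plain requests against the same example host; the field names follow the GA4GH DRS spec and error handling is omitted:

    import requests

    base = "https://drs.example.org/ga4gh/drs/v1"
    obj = requests.get(f"{base}/objects/314160", timeout=30).json()

    # An access method either embeds an access_url or points at an access_id
    # that must be resolved with a second request.
    method = obj["access_methods"][0]
    if "access_id" in method:
        access = requests.get(f"{base}/objects/314160/access/{method['access_id']}", timeout=30).json()
        url = access["url"]                  # e.g. an https:// or s3:// URL
        headers = access.get("headers", [])  # optional per-request headers
    else:
        url = method["access_url"]["url"]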
