
Commit d40e20f

Route S3 URLs through file source system instead of bypass
- Enable s3fs as stock file source for DRS S3 URLs
- Fix DRS tests with proper mock endpoints
1 parent d5096eb commit d40e20f

File tree: 3 files changed, +64 -31 lines changed


lib/galaxy/files/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -162,6 +162,8 @@ def _ensure_loaded(plugin_type):
         _ensure_loaded("base64")
         _ensure_loaded("drs")
         _ensure_loaded("remoteZip")
+        # Do we actually want to do this here, if we're doing drs+s3fs?
+        _ensure_loaded("s3fs")
 
         if file_sources_config.ftp_upload_dir is not None:
             _ensure_loaded("gxftp")

lib/galaxy/files/sources/util.py

Lines changed: 25 additions & 28 deletions
@@ -40,8 +40,11 @@ def _not_implemented(drs_uri: str, desc: str) -> NotImplementedError:
     if "s3" in desc.lower():
         rest_of_message = """For S3 access methods, this DRS resource uses AWS S3 storage.
 
-Most research data repositories require AWS credentials for S3 access:
-- Public datasets: May allow anonymous access via configured S3 file source
+S3 URLs are now handled through Galaxy's file source system. If you're seeing this error,
+it means no configured S3 file source can handle the S3 URLs returned by this DRS service.
+
+Most research data repositories require specific AWS credentials for S3 access:
+- Public datasets: May work with anonymous S3 file source (anon: true)
 - Controlled access: Requires specific AWS credentials/permissions
 - SPARC datasets: Use "Requester Pays" model (user pays ~$0.09/GB)
 
@@ -55,8 +58,8 @@ def _not_implemented(drs_uri: str, desc: str) -> NotImplementedError:
       secret: YOUR_AWS_SECRET_KEY
       id: s3_research_data
 
-Note: Some datasets (like SPARC) require RequestPayer='requester' parameter
-which is not currently supported by Galaxy's S3 file source.
+Galaxy includes a stock S3 file source for basic anonymous access, but it may not
+work with all S3 buckets depending on their access policies.
 """
     else:
         rest_of_message = """Currently Galaxy client only works with HTTP/HTTPS targets but extensions for
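For reference, the configuration that the revised error message points users toward lives in a file sources YAML config. The sketch below only echoes the fragments quoted in the message above (type s3fs, id, anon: true, secret); the doc field, the key field name, and the anonymous-access entry as a whole are assumptions rather than something this commit defines:

    # Hedged sketch of file sources config entries for S3 access.
    - type: s3fs
      id: s3_anon_public
      doc: Anonymous access to public S3 buckets
      anon: true

    - type: s3fs
      id: s3_research_data
      doc: Credentialed access to controlled S3 data
      key: YOUR_AWS_ACCESS_KEY
      secret: YOUR_AWS_SECRET_KEY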
@@ -121,14 +124,15 @@ def _download_s3_file(s3_url: str, target_path: StrPath, headers: Optional[dict]
         response = requests.get(s3_url, headers=headers or {}, timeout=DEFAULT_SOCKET_TIMEOUT, stream=True)
         response.raise_for_status()
 
-        with open(target_path, 'wb') as f:
+        with open(target_path, "wb") as f:
             for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                 f.write(chunk)
         return
 
     # For raw S3 URLs, try s3fs with different access patterns
     log.debug(f"Using s3fs for S3 URL: {s3_url}")
     import s3fs
+
     s3_path = s3_url[5:]  # Remove 's3://' prefix
 
     # Try different S3 access methods in order of preference
@@ -142,8 +146,8 @@ def _download_s3_file(s3_url: str, target_path: StrPath, headers: Optional[dict]
     for method_name, fs_factory in access_methods:
         try:
             fs = fs_factory()
-            with fs.open(s3_path, 'rb') as s3_file:
-                with open(target_path, 'wb') as local_file:
+            with fs.open(s3_path, "rb") as s3_file:
+                with open(target_path, "wb") as local_file:
                     while True:
                         chunk = s3_file.read(CHUNK_SIZE)
                         if not chunk:
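The access_methods list iterated in the second hunk is defined just above it and is not part of this diff. A sketch of how such a (name, factory) fallback list could be assembled with s3fs; the specific method names, their ordering, and the requester-pays entry are assumptions:

    # Sketch: access patterns tried in order by _download_s3_file.
    import s3fs

    access_methods = [
        ("anonymous", lambda: s3fs.S3FileSystem(anon=True)),
        ("default credentials", lambda: s3fs.S3FileSystem()),
        ("requester pays", lambda: s3fs.S3FileSystem(requester_pays=True)),
    ]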
@@ -385,27 +389,20 @@ def fetch_drs_to_file(
             opts.extra_props = PartialFilesSourceProperties(**extra_props)
 
         try:
-            # Handle S3 URLs directly using s3fs instead of going through file sources
-            if access_url.startswith("s3://"):
-                log.debug(f"Handling S3 URL directly: {access_url}")
-                _download_s3_file(access_url, target_path, access_headers)
-                downloaded = True
-                break
-            else:
-                file_sources = (
-                    user_context.file_sources
-                    if user_context
-                    else ConfiguredFileSources.from_dict(None, load_stock_plugins=True)
-                )
-                stream_url_to_file(
-                    access_url,
-                    target_path=str(target_path),
-                    file_sources=file_sources,
-                    user_context=user_context,
-                    file_source_opts=opts,
-                )
-                downloaded = True
-                break
+            file_sources = (
+                user_context.file_sources
+                if user_context
+                else ConfiguredFileSources.from_dict(None, load_stock_plugins=True)
+            )
+            stream_url_to_file(
+                access_url,
+                target_path=str(target_path),
+                file_sources=file_sources,
+                user_context=user_context,
+                file_source_opts=opts,
+            )
+            downloaded = True
+            break
         except exceptions.RequestParameterInvalidException as e:
             log.debug(f"Failed to fetch via {access_method['type']} access method: {e}")
             continue
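Outside the DRS access-method loop, the same two calls shown in this hunk are enough to pull any supported URL, including s3://, down to a local path. A minimal usage sketch; the module path for stream_url_to_file and the bucket/key are assumptions:

    # Sketch: fetch an S3 object to a local file via the file source system.
    from galaxy.files import ConfiguredFileSources
    from galaxy.files.uris import stream_url_to_file  # assumed import location

    file_sources = ConfiguredFileSources.from_dict(None, load_stock_plugins=True)
    stream_url_to_file(
        "s3://ga4gh-demo-data/phenopackets/Cao-2018-TGFBR2-Patient_4.json",
        target_path="/tmp/patient_4.json",
        file_sources=file_sources,
    )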

test/unit/files/test_drs.py

Lines changed: 37 additions & 3 deletions
@@ -38,13 +38,25 @@ def drs_repo_handler(request):
         }
         return (200, {}, json.dumps(data))
 
+    def access_handler(request):
+        assert request.headers["Authorization"] == "Bearer IBearTokens"
+        access_data = {"url": "https://my.respository.org/myfile.txt", "headers": ["Authorization: Basic Z2E0Z2g6ZHJz"]}
+        return (200, {}, json.dumps(access_data))
+
     responses.add_callback(
         responses.GET,
         "https://drs.example.org/ga4gh/drs/v1/objects/314159",
         callback=drs_repo_handler,
         content_type="application/json",
     )
 
+    responses.add_callback(
+        responses.GET,
+        "https://drs.example.org/ga4gh/drs/v1/objects/314159/access/1234",
+        callback=access_handler,
+        content_type="application/json",
+    )
+
     test_url = "drs://drs.example.org/314159"
 
     def check_specific_header(request, **kwargs):
@@ -86,13 +98,25 @@ def drs_repo_handler(request):
         }
         return (200, {}, json.dumps(data))
 
+    def access_handler(request):
+        assert request.headers["Authorization"] == "Bearer IBearTokens"
+        access_data = {"url": "s3://ga4gh-demo-data/phenopackets/Cao-2018-TGFBR2-Patient_4.json", "headers": []}
+        return (200, {}, json.dumps(access_data))
+
     responses.add_callback(
         responses.GET,
         "https://drs.example.org/ga4gh/drs/v1/objects/314160",
         callback=drs_repo_handler,
         content_type="application/json",
     )
 
+    responses.add_callback(
+        responses.GET,
+        "https://drs.example.org/ga4gh/drs/v1/objects/314160/access/1234",
+        callback=access_handler,
+        content_type="application/json",
+    )
+
     test_url = "drs://drs.example.org/314160"
     file_sources = configured_file_sources(FILE_SOURCES_CONF)
     user_context = user_context_fixture(file_sources=file_sources)
@@ -101,6 +125,16 @@ def drs_repo_handler(request):
     assert file_source_pair.path == test_url
     assert file_source_pair.file_source.id == "test1"
 
-    assert_realizes_contains(
-        file_sources, test_url, "PMID:30101859-Cao-2018-TGFBR2-Patient_4", user_context=user_context
-    )
+    # Mock the S3 file source realize_to method to return test content
+    def mock_s3_realize_to(source_path, native_path, user_context=None, opts=None):
+        with open(native_path, "w") as f:
+            f.write("PMID:30101859-Cao-2018-TGFBR2-Patient_4 test data")
+
+    # Find the S3 file source and patch it
+    for fs in file_sources._file_sources:
+        if fs.plugin_type == "s3fs":
+            with mock.patch.object(fs, "realize_to", side_effect=mock_s3_realize_to):
+                assert_realizes_contains(
+                    file_sources, test_url, "PMID:30101859-Cao-2018-TGFBR2-Patient_4", user_context=user_context
+                )
+            break
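The added /access/1234 mocks emulate the second step of DRS resolution: the object record lists access methods carrying an access_id, and a follow-up request to /objects/{id}/access/{access_id} returns the concrete URL (HTTPS in the first test, s3:// in the second) plus any per-request headers. A rough sketch of that two-step flow with plain requests against the same example host; the field names follow the GA4GH DRS spec and error handling is omitted:

    import requests

    base = "https://drs.example.org/ga4gh/drs/v1"
    obj = requests.get(f"{base}/objects/314160", timeout=30).json()

    # An access method either embeds an access_url or points at an access_id
    # that must be resolved with a second request.
    method = obj["access_methods"][0]
    if "access_id" in method:
        access = requests.get(f"{base}/objects/314160/access/{method['access_id']}", timeout=30).json()
        url = access["url"]                  # e.g. an https:// or s3:// URL
        headers = access.get("headers", [])  # optional per-request headers
    else:
        url = method["access_url"]["url"]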
