Avoid outerjoins when matching dataset hashes

mvdbeek · mvdbeek · commit 177b66f19087 · 2025-11-04T13:06:01.000+01:00
diff --git a/lib/galaxy/managers/jobs.py b/lib/galaxy/managers/jobs.py
@@ -177,22 +177,29 @@ def has_same_hash(
 ) -> "Select[tuple[int]]":
     a_hash = aliased(model.DatasetHash)
     b_hash = aliased(model.DatasetHash)
-    stmt = (
-        stmt.outerjoin(a_hash, a.dataset_id == a_hash.dataset_id)
-        .outerjoin(
-            b_hash,
-            and_(
-                a_hash.hash_function == b_hash.hash_function,
-                a_hash.hash_value == b_hash.hash_value,
-            ),
-        )
-        .join(
-            b,
-            or_(
-                b.dataset_id == a.dataset_id,
-                b_hash.dataset_id == b.dataset_id,
+    # Join b directly, matching on either the same dataset or a shared hash.
+    # The hash match is a correlated subquery on a.dataset_id, which avoids the
+    # expensive row fan-out (one row per hash pair) of the original outerjoin approach.
+    stmt = stmt.join(
+        b,
+        or_(
+            # Direct dataset match
+            b.dataset_id == a.dataset_id,
+            # Hash match: b's dataset shares a (hash_function, hash_value) pair with a's dataset.
+            # As before, both datasets must have hash rows to match (inner join in the subquery).
+            b.dataset_id.in_(
+                select(b_hash.dataset_id)
+                .select_from(a_hash)
+                .join(
+                    b_hash,
+                    and_(
+                        a_hash.hash_function == b_hash.hash_function,
+                        a_hash.hash_value == b_hash.hash_value,
+                    ),
+                )
+                .where(a_hash.dataset_id == a.dataset_id)
             ),
-        )
+        ),
     )
     return stmt