Skip to content

Commit 177b66f

Browse files
committed
Avoid outerjoins when matching dataset hashes
1 parent fd5e48d commit 177b66f

File tree

1 file changed

+22
-15
lines changed

1 file changed

+22
-15
lines changed

lib/galaxy/managers/jobs.py

Lines changed: 22 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -177,22 +177,29 @@ def has_same_hash(
177177
) -> "Select[tuple[int]]":
178178
a_hash = aliased(model.DatasetHash)
179179
b_hash = aliased(model.DatasetHash)
180-
stmt = (
181-
stmt.outerjoin(a_hash, a.dataset_id == a_hash.dataset_id)
182-
.outerjoin(
183-
b_hash,
184-
and_(
185-
a_hash.hash_function == b_hash.hash_function,
186-
a_hash.hash_value == b_hash.hash_value,
187-
),
188-
)
189-
.join(
190-
b,
191-
or_(
192-
b.dataset_id == a.dataset_id,
193-
b_hash.dataset_id == b.dataset_id,
180+
# Join b directly, checking for either direct dataset match or hash match
181+
# The hash match uses a correlated subquery to avoid the expensive cartesian product
182+
# of the original outerjoin approach
183+
stmt = stmt.join(
184+
b,
185+
or_(
186+
# Direct dataset match
187+
b.dataset_id == a.dataset_id,
188+
# Hash match: b's dataset has a hash that matches any of a's hashes
189+
# This preserves the original behavior where both datasets must have hashes to match
190+
b.dataset_id.in_(
191+
select(b_hash.dataset_id)
192+
.select_from(a_hash)
193+
.join(
194+
b_hash,
195+
and_(
196+
a_hash.hash_function == b_hash.hash_function,
197+
a_hash.hash_value == b_hash.hash_value,
198+
),
199+
)
200+
.where(a_hash.dataset_id == a.dataset_id)
194201
),
195-
)
202+
),
196203
)
197204
return stmt
198205

0 commit comments

Comments
 (0)