File tree Expand file tree Collapse file tree 1 file changed +22
-15
lines changed
Expand file tree Collapse file tree 1 file changed +22
-15
lines changed Original file line number Diff line number Diff line change @@ -177,22 +177,29 @@ def has_same_hash(
177177) -> "Select[tuple[int]]" :
178178 a_hash = aliased (model .DatasetHash )
179179 b_hash = aliased (model .DatasetHash )
180- stmt = (
181- stmt .outerjoin (a_hash , a .dataset_id == a_hash .dataset_id )
182- .outerjoin (
183- b_hash ,
184- and_ (
185- a_hash .hash_function == b_hash .hash_function ,
186- a_hash .hash_value == b_hash .hash_value ,
187- ),
188- )
189- .join (
190- b ,
191- or_ (
192- b .dataset_id == a .dataset_id ,
193- b_hash .dataset_id == b .dataset_id ,
180+ # Join b directly, matching on either a direct dataset match or a hash match.
181+ # The hash match uses a correlated subquery to avoid the expensive cartesian product
182+ # of the original outerjoin approach.
183+ stmt = stmt .join (
184+ b ,
185+ or_ (
186+ # Direct dataset match
187+ b .dataset_id == a .dataset_id ,
188+ # Hash match: b's dataset has a hash that matches any of a's hashes.
189+ # As in the original, a hash match requires both datasets to have hash rows.
190+ b .dataset_id .in_ (
191+ select (b_hash .dataset_id )
192+ .select_from (a_hash )
193+ .join (
194+ b_hash ,
195+ and_ (
196+ a_hash .hash_function == b_hash .hash_function ,
197+ a_hash .hash_value == b_hash .hash_value ,
198+ ),
199+ )
200+ .where (a_hash .dataset_id == a .dataset_id )
194201 ),
195- )
202+ ),
196203 )
197204 return stmt
198205
You can’t perform that action at this time.
0 commit comments