@@ -115,6 +115,62 @@ def test_insert_posts_batch_with_duplicates(self, postgres_db):
115115 cur .execute ("DELETE FROM posts WHERE subreddit = 'test_dup'" )
116116 conn .commit ()
117117
def test_insert_posts_batch_within_batch_duplicate_ids(self, postgres_db):
    """A duplicate ID within a single batch must not abort the whole COPY.

    Overlapping source dumps repeat the same row inside one batch; the
    posts_staging PRIMARY KEY would otherwise fail the entire COPY (taking
    every other post in the batch down with it). The duplicate is dropped
    and the remaining rows still land.

    Cleanup runs in a ``finally`` block so that a failing assertion cannot
    leave ``test_wbdup`` rows behind in the database for later tests.
    """

    def _post(pid: str, title: str, created_utc: int, score: int) -> dict:
        # Only the fields that differ between the rows are parameters;
        # everything else is shared by all posts in this batch.
        return {
            "id": pid,
            "subreddit": "test_wbdup",
            "author": "a",
            "title": title,
            "created_utc": created_utc,
            "score": score,
            "permalink": f"/r/test_wbdup/comments/{pid}/",
            "platform": "reddit",
        }

    posts = [
        _post("wb_dup_post", "First", 1640000000, 10),
        _post("wb_dup_post", "Duplicate", 1640000000, 99),  # same ID, same batch
        _post("wb_unique_post", "Unique", 1640000001, 5),
    ]

    try:
        _successful, failed, _failed_ids = postgres_db.insert_posts_batch(posts)

        # The whole batch must not be lost to the duplicate: both distinct IDs land.
        assert failed == 0
        with postgres_db.pool.get_connection() as conn, conn.cursor() as cur:
            cur.execute("SELECT COUNT(DISTINCT id) FROM posts WHERE subreddit = 'test_wbdup'")
            assert cur.fetchone()["count"] == 2
            # The repeated ID must be stored exactly once.
            cur.execute("SELECT COUNT(*) FROM posts WHERE id = 'wb_dup_post'")
            assert cur.fetchone()["count"] == 1
    finally:
        # Cleanup must run even when an assertion above fails; otherwise the
        # rows inserted by this test would remain in the database.
        with postgres_db.pool.get_connection() as conn, conn.cursor() as cur:
            cur.execute("DELETE FROM posts WHERE subreddit = 'test_wbdup'")
            conn.commit()
173+
118174 def test_insert_posts_batch_empty_list (self , postgres_db ):
119175 """Test empty batch insertion."""
120176 successful , failed , _failed_ids = postgres_db .insert_posts_batch ([])
@@ -213,6 +269,62 @@ def test_insert_comments_batch_basic(self, postgres_db):
213269 cur .execute ("DELETE FROM posts WHERE id = 'comment_parent_post'" )
214270 conn .commit ()
215271
def test_insert_comments_batch_within_batch_duplicate_ids(self, postgres_db):
    """A duplicate comment ID within a single batch must not abort the whole COPY.

    This is the exact failure observed on the Voat searchvoat dump, where
    overlapping exports repeated a comment row: comments_staging's PRIMARY
    KEY failed the COPY and dropped the entire 1000-row batch. The duplicate
    is now skipped and the rest of the batch still lands.

    Cleanup runs in a ``finally`` block so that a failing assertion cannot
    leave the ``test_wbcdup`` comments or the parent post behind in the
    database for later tests.
    """
    parent_post = {
        "id": "wbc_parent_post",
        "subreddit": "test_wbcdup",
        "author": "post_author",
        "title": "Parent",
        "created_utc": 1640000000,
        "score": 100,
        "permalink": "/r/test_wbcdup/comments/wbc_parent_post/",
        "platform": "reddit",
    }

    def _comment(cid: str, score: int, body: str) -> dict:
        # Every comment hangs directly off the parent post; only the ID,
        # score and body vary between rows.
        return {
            "id": cid,
            "subreddit": "test_wbcdup",
            "author": "commenter",
            "body": body,
            "created_utc": 1640000100,
            "score": score,
            "post_id": "wbc_parent_post",
            "link_id": "t3_wbc_parent_post",
            "parent_id": "t3_wbc_parent_post",
            "permalink": f"/r/test_wbcdup/comments/wbc_parent_post/_/{cid}/",
            "platform": "reddit",
        }

    comments = [
        _comment("wbc_dup", 10, "first"),
        _comment("wbc_dup", 99, "duplicate in same batch"),
        _comment("wbc_unique", 5, "unique"),
    ]

    try:
        # Fail fast on broken setup so a missing parent post is not
        # misreported as a comment-insert failure.
        _posts_ok, posts_failed, _failed_ids = postgres_db.insert_posts_batch([parent_post])
        assert posts_failed == 0, "setup: parent post insert failed"

        _successful, failed = postgres_db.insert_comments_batch(comments)

        # The whole batch must not be lost to the duplicate: both distinct IDs land.
        assert failed == 0
        with postgres_db.pool.get_connection() as conn, conn.cursor() as cur:
            cur.execute("SELECT COUNT(DISTINCT id) FROM comments WHERE subreddit = 'test_wbcdup'")
            assert cur.fetchone()["count"] == 2
            # The repeated ID must be stored exactly once.
            cur.execute("SELECT COUNT(*) FROM comments WHERE id = 'wbc_dup'")
            assert cur.fetchone()["count"] == 1
    finally:
        # Cleanup must run even when an assertion above fails. Comments are
        # deleted before the parent post they reference.
        with postgres_db.pool.get_connection() as conn, conn.cursor() as cur:
            cur.execute("DELETE FROM comments WHERE subreddit = 'test_wbcdup'")
            cur.execute("DELETE FROM posts WHERE id = 'wbc_parent_post'")
            conn.commit()
327+
216328 def test_insert_comments_batch_empty_list (self , postgres_db ):
217329 """Test empty comment batch insertion."""
218330 successful , failed = postgres_db .insert_comments_batch ([])
0 commit comments