Merge pull request #79 from 19-84/fix/voat-dup-id-batch-drop

19-84 · web-flow · commit 88c77eaef47f · 2026-06-14T20:55:10.000Z
fix(import): dedupe IDs within COPY batch; relax export/enrich file requirement
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,20 @@ All notable changes to Redd-Archiver will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [Unreleased]
+
+### Fixed
+- Comment/post import dropped an entire COPY batch (up to ~1,000 rows) when the
+  source contained a duplicate ID within that batch — the staging table's
+  PRIMARY KEY aborted the whole COPY, not just the dup. Observed on the Voat
+  searchvoat.co dump (overlapping exports repeat rows): a single `privacy`
+  subverse lost ~850 comments. Duplicate IDs within a batch are now de-duped
+  before COPY and reported; cross-batch repeats were already upserted.
+- `--export-from-database` (and the `--enrich*` modes) no longer require
+  `--comments-file`/`--submissions-file` when a single community is named.
+  Those modes read from the database / metadata dumps, not the source files,
+  so the requirement was spurious.
+
 ## [1.1.0] — 2026-06-12 — "Living Archive"
 
 The archive is no longer a snapshot: serve it three ways, keep it current
diff --git a/core/postgres_database.py b/core/postgres_database.py
@@ -789,6 +789,7 @@ def insert_posts_batch(
         successful = 0
         failed = 0
         skipped = 0  # Track posts without valid IDs
+        duplicates = 0  # Track duplicate IDs within a batch (e.g. overlapping source dumps)
         failed_post_ids: set[str] = set()  # Track which specific posts failed
         total_posts = len(posts)
         current_batch_size = initial_batch_size
@@ -813,6 +814,11 @@ def insert_posts_batch(
                         # This eliminates 350MB buffer overhead per batch
                         copy_buffer = StringIO()
                         records_prepared = 0
+                        # Dedupe IDs within the batch: posts_staging has a PRIMARY KEY,
+                        # so a repeated ID (common when source dumps overlap) aborts the whole
+                        # COPY — not just the dup row. Cross-batch repeats are handled by the
+                        # ON CONFLICT upsert below; only same-batch collisions break COPY.
+                        seen_ids: set[str] = set()
 
                         for post in batch:
                             # Validate post has a valid ID before attempting insertion
@@ -821,6 +827,12 @@ def insert_posts_batch(
                                 skipped += 1
                                 continue
 
+                            id_key = str(post_id)
+                            if id_key in seen_ids:
+                                duplicates += 1
+                                continue
+                            seen_ids.add(id_key)
+
                             try:
                                 sanitized_post = self._sanitize_recursive(post)
                                 json_data = json.dumps(sanitized_post, allow_nan=False)
@@ -948,6 +960,9 @@ def insert_posts_batch(
             if skipped > 0:
                 print_warning(f"Skipped {skipped} posts with missing or empty IDs")
 
+            if duplicates > 0:
+                print_warning(f"Skipped {duplicates} duplicate post IDs within batches (overlapping source data)")
+
             if failed > 0:
                 print_warning(f"Failed to insert {failed} posts ({len(failed_post_ids)} unique IDs tracked)")
 
@@ -986,6 +1001,7 @@ def insert_comments_batch(
         successful = 0
         failed = 0
         skipped = 0  # Track comments without valid IDs
+        duplicates = 0  # Track duplicate IDs within a batch (e.g. overlapping source dumps)
         total_comments = len(comments)
         current_batch_size = initial_batch_size
 
@@ -1019,6 +1035,11 @@ def insert_comments_batch(
                         # Use PostgreSQL COPY protocol for true streaming (no buffering)
                         copy_buffer = StringIO()
                         records_prepared = 0
+                        # Dedupe IDs within the batch: comments_staging has a PRIMARY KEY,
+                        # so a repeated ID (common when source dumps overlap) aborts the whole
+                        # COPY — not just the dup row. Cross-batch repeats are handled by the
+                        # ON CONFLICT upsert below; only same-batch collisions break COPY.
+                        seen_ids: set[str] = set()
 
                         for comment in batch:
                             # Validate comment has a valid ID before attempting insertion
@@ -1027,6 +1048,12 @@ def insert_comments_batch(
                                 skipped += 1
                                 continue
 
+                            id_key = str(comment_id)
+                            if id_key in seen_ids:
+                                duplicates += 1
+                                continue
+                            seen_ids.add(id_key)
+
                             try:
                                 # Extract post_id (parent thread ID)
                                 # For multi-platform support, prefer post_id field from normalized data
@@ -1192,6 +1219,9 @@ def insert_comments_batch(
             if skipped > 0:
                 print_warning(f"Skipped {skipped} comments with missing or empty IDs")
 
+            if duplicates > 0:
+                print_warning(f"Skipped {duplicates} duplicate comment IDs within batches (overlapping source data)")
+
             if failed > 0:
                 print_warning(f"Failed to insert {failed} comments")
 
diff --git a/reddarc.py b/reddarc.py
@@ -639,6 +639,25 @@ def detect_resume_state_and_files(
         return "start_fresh", None, {}
 
 
+def _import_needs_source_files(args: argparse.Namespace) -> bool:
+    """Whether the run consumes --comments-file/--submissions-file.
+
+    Only a real import from the positional input reads the source files.
+    --export-from-database renders from the database, and the --enrich* paths
+    read from metadata dumps — none of them touch the comments/submissions
+    files, so those flags must not be required in those modes.
+    """
+    return not (
+        args.export_from_database
+        or args.enrich
+        or args.enrich_metadata
+        or args.enrich_rules
+        or args.enrich_wikis
+        or args.enrich_voat
+        or args.voat_thumbnails
+    )
+
+
 def main() -> None:
     parser = argparse.ArgumentParser(
         description="Generate multi-platform archive websites from Reddit, Voat, and Ruqqus data",
@@ -966,19 +985,23 @@ def main() -> None:
 
     # Validate single community mode arguments (subreddit/subverse/guild)
     community_filter = args.subreddit or args.subverses or args.guilds
+    # Source files are only consumed when importing from the positional input;
+    # export/enrich modes read from the DB or metadata dumps instead.
+    import_needs_files = _import_needs_source_files(args)
     if community_filter:
-        if not (args.comments_file and args.submissions_file):
-            platform_name = "subreddit" if args.subreddit else ("subverse" if args.subverses else "guild")
-            print_error(f"Single {platform_name} mode requires both --comments-file and --submissions-file")
-            print_info(
-                f"Example: python reddarc.py /data --{platform_name} example --comments-file /data/example_comments.zst --submissions-file /data/example_submissions.zst"
-            )
-            return
-        if not all(os.path.exists(f) for f in [args.comments_file, args.submissions_file]):
-            print_error("One or both specified files do not exist:")
-            print_error(f"  Comments: {args.comments_file} (exists: {os.path.exists(args.comments_file)})")
-            print_error(f"  Submissions: {args.submissions_file} (exists: {os.path.exists(args.submissions_file)})")
-            return
+        if import_needs_files:
+            if not (args.comments_file and args.submissions_file):
+                platform_name = "subreddit" if args.subreddit else ("subverse" if args.subverses else "guild")
+                print_error(f"Single {platform_name} mode requires both --comments-file and --submissions-file")
+                print_info(
+                    f"Example: python reddarc.py /data --{platform_name} example --comments-file /data/example_comments.zst --submissions-file /data/example_submissions.zst"
+                )
+                return
+            if not all(os.path.exists(f) for f in [args.comments_file, args.submissions_file]):
+                print_error("One or both specified files do not exist:")
+                print_error(f"  Comments: {args.comments_file} (exists: {os.path.exists(args.comments_file)})")
+                print_error(f"  Submissions: {args.submissions_file} (exists: {os.path.exists(args.submissions_file)})")
+                return
         prefix = "r/" if args.subreddit else ("v/" if args.subverses else "g/")
         print_info(f"Single community mode: processing {prefix}{community_filter}")
     elif args.comments_file or args.submissions_file:
@@ -1009,6 +1032,14 @@ def main() -> None:
             process_export_only(args.input_dir or ".", args.output, {}, args)
         return
 
+    # Export-only mode reads everything from the database — no source files or
+    # input-dir discovery required. Dispatch here, before the import path, so a
+    # plain `--export-from-database` does not demand --comments-file/
+    # --submissions-file or a populated input directory.
+    if args.export_from_database:
+        process_export_only(args.input_dir or ".", args.output, {}, args)
+        return
+
     # Validate input directory
     if not args.input_dir:
         print_error("input_dir is required (directory containing .zst files)")
diff --git a/tests/test_cli_validation.py b/tests/test_cli_validation.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python
+"""
+ABOUTME: Unit tests for reddarc CLI argument validation helpers
+ABOUTME: Covers which run modes require --comments-file/--submissions-file
+"""
+
+import argparse
+
+import pytest
+
+from reddarc import _import_needs_source_files
+
+
+def _args(**overrides) -> argparse.Namespace:
+    """Build an args namespace with all mode flags off, then apply overrides."""
+    base = {
+        "export_from_database": False,
+        "enrich": None,
+        "enrich_metadata": None,
+        "enrich_rules": None,
+        "enrich_wikis": None,
+        "enrich_voat": None,
+        "voat_thumbnails": None,
+    }
+    base.update(overrides)
+    return argparse.Namespace(**base)
+
+
+@pytest.mark.unit
+class TestImportNeedsSourceFiles:
+    """A community filter only requires source files for a real import."""
+
+    def test_plain_import_requires_files(self):
+        # No export/enrich flag set: a normal import reads the source files.
+        assert _import_needs_source_files(_args()) is True
+
+    def test_export_from_database_does_not_require_files(self):
+        # Regression: `--export-from-database --subverse x` used to demand
+        # --comments-file/--submissions-file even though it renders from the DB.
+        assert _import_needs_source_files(_args(export_from_database=True)) is False
+
+    @pytest.mark.parametrize(
+        "flag",
+        ["enrich", "enrich_metadata", "enrich_rules", "enrich_wikis", "enrich_voat", "voat_thumbnails"],
+    )
+    def test_enrich_modes_do_not_require_files(self, flag):
+        # Enrichment reads metadata dumps / the DB, not the comments/submissions files.
+        assert _import_needs_source_files(_args(**{flag: "/some/path"})) is False
+
+    def test_enrich_chained_with_export_still_does_not_require_files(self):
+        assert _import_needs_source_files(_args(enrich_voat="/p", export_from_database=True)) is False
diff --git a/tests/test_postgres_database_extended.py b/tests/test_postgres_database_extended.py
@@ -115,6 +115,62 @@ def test_insert_posts_batch_with_duplicates(self, postgres_db):
             cur.execute("DELETE FROM posts WHERE subreddit = 'test_dup'")
             conn.commit()
 
+    def test_insert_posts_batch_within_batch_duplicate_ids(self, postgres_db):
+        """A duplicate ID within a single batch must not abort the whole COPY.
+
+        Overlapping source dumps repeat the same row inside one batch; the
+        posts_staging PRIMARY KEY would otherwise fail the entire COPY (taking
+        every other post in the batch down with it). The duplicate is dropped
+        and the remaining rows still land.
+        """
+        posts = [
+            {
+                "id": "wb_dup_post",
+                "subreddit": "test_wbdup",
+                "author": "a",
+                "title": "First",
+                "created_utc": 1640000000,
+                "score": 10,
+                "permalink": "/r/test_wbdup/comments/wb_dup_post/",
+                "platform": "reddit",
+            },
+            {
+                "id": "wb_dup_post",  # same ID, same batch
+                "subreddit": "test_wbdup",
+                "author": "a",
+                "title": "Duplicate",
+                "created_utc": 1640000000,
+                "score": 99,
+                "permalink": "/r/test_wbdup/comments/wb_dup_post/",
+                "platform": "reddit",
+            },
+            {
+                "id": "wb_unique_post",
+                "subreddit": "test_wbdup",
+                "author": "a",
+                "title": "Unique",
+                "created_utc": 1640000001,
+                "score": 5,
+                "permalink": "/r/test_wbdup/comments/wb_unique_post/",
+                "platform": "reddit",
+            },
+        ]
+
+        _successful, failed, _failed_ids = postgres_db.insert_posts_batch(posts)
+
+        # The whole batch must not be lost to the duplicate: both distinct IDs land.
+        assert failed == 0
+        with postgres_db.pool.get_connection() as conn, conn.cursor() as cur:
+            cur.execute("SELECT COUNT(DISTINCT id) FROM posts WHERE subreddit = 'test_wbdup'")
+            assert cur.fetchone()["count"] == 2
+            cur.execute("SELECT COUNT(*) FROM posts WHERE id = 'wb_dup_post'")
+            assert cur.fetchone()["count"] == 1
+
+        # Cleanup
+        with postgres_db.pool.get_connection() as conn, conn.cursor() as cur:
+            cur.execute("DELETE FROM posts WHERE subreddit = 'test_wbdup'")
+            conn.commit()
+
     def test_insert_posts_batch_empty_list(self, postgres_db):
         """Test empty batch insertion."""
         successful, failed, _failed_ids = postgres_db.insert_posts_batch([])
@@ -213,6 +269,62 @@ def test_insert_comments_batch_basic(self, postgres_db):
             cur.execute("DELETE FROM posts WHERE id = 'comment_parent_post'")
             conn.commit()
 
+    def test_insert_comments_batch_within_batch_duplicate_ids(self, postgres_db):
+        """A duplicate comment ID within a single batch must not abort the whole COPY.
+
+        This is the exact failure observed on the Voat searchvoat dump, where
+        overlapping exports repeated a comment row: comments_staging's PRIMARY
+        KEY failed the COPY and dropped the entire 1000-row batch. The duplicate
+        is now skipped and the rest of the batch still lands.
+        """
+        parent_post = {
+            "id": "wbc_parent_post",
+            "subreddit": "test_wbcdup",
+            "author": "post_author",
+            "title": "Parent",
+            "created_utc": 1640000000,
+            "score": 100,
+            "permalink": "/r/test_wbcdup/comments/wbc_parent_post/",
+            "platform": "reddit",
+        }
+        postgres_db.insert_posts_batch([parent_post])
+
+        def _comment(cid: str, score: int, body: str) -> dict:
+            return {
+                "id": cid,
+                "subreddit": "test_wbcdup",
+                "author": "commenter",
+                "body": body,
+                "created_utc": 1640000100,
+                "score": score,
+                "post_id": "wbc_parent_post",
+                "link_id": "t3_wbc_parent_post",
+                "parent_id": "t3_wbc_parent_post",
+                "permalink": "/r/test_wbcdup/comments/wbc_parent_post/_/" + cid + "/",
+                "platform": "reddit",
+            }
+
+        comments = [
+            _comment("wbc_dup", 10, "first"),
+            _comment("wbc_dup", 99, "duplicate in same batch"),
+            _comment("wbc_unique", 5, "unique"),
+        ]
+
+        _successful, failed = postgres_db.insert_comments_batch(comments)
+
+        assert failed == 0
+        with postgres_db.pool.get_connection() as conn, conn.cursor() as cur:
+            cur.execute("SELECT COUNT(DISTINCT id) FROM comments WHERE subreddit = 'test_wbcdup'")
+            assert cur.fetchone()["count"] == 2
+            cur.execute("SELECT COUNT(*) FROM comments WHERE id = 'wbc_dup'")
+            assert cur.fetchone()["count"] == 1
+
+        # Cleanup
+        with postgres_db.pool.get_connection() as conn, conn.cursor() as cur:
+            cur.execute("DELETE FROM comments WHERE subreddit = 'test_wbcdup'")
+            cur.execute("DELETE FROM posts WHERE id = 'wbc_parent_post'")
+            conn.commit()
+
     def test_insert_comments_batch_empty_list(self, postgres_db):
         """Test empty comment batch insertion."""
         successful, failed = postgres_db.insert_comments_batch([])