@@ -0,0 +1,73 @@
# Generated by Django 5.2.8 on 2025-12-15 14:07

import hashlib
from django.db import migrations, models
from django.core.files.storage import default_storage


def backfill_attachment_hashes(apps, schema_editor):
"""
Compute and store SHA256 hashes for all existing attachments.
Uses chunked reading to avoid loading large files into memory.
"""
EvidenceRevision = apps.get_model("core", "EvidenceRevision")

revisions_with_attachments = EvidenceRevision.objects.filter(
attachment__isnull=False
).exclude(attachment="")

total = revisions_with_attachments.count()
processed = 0
errors = 0

print(f"Backfilling attachment hashes for {total} evidence revisions...")

for revision in revisions_with_attachments.iterator(chunk_size=100):
try:
if revision.attachment and default_storage.exists(revision.attachment.name):
hash_obj = hashlib.sha256()
with default_storage.open(revision.attachment.name, "rb") as f:
for chunk in iter(lambda: f.read(1024 * 1024), b""): # 1MB chunks
hash_obj.update(chunk)

revision.attachment_hash = hash_obj.hexdigest()
revision.save(update_fields=["attachment_hash"])
processed += 1

if processed % 100 == 0:
print(f" Processed {processed}/{total} revisions...")
except Exception as e:
errors += 1
print(f" Error processing revision {revision.id}: {e}")

print(f"Completed: {processed} hashes computed, {errors} errors")


def reverse_backfill(apps, schema_editor):
"""
Clear all attachment hashes (reverse operation).
"""
EvidenceRevision = apps.get_model("core", "EvidenceRevision")
EvidenceRevision.objects.update(attachment_hash=None)


class Migration(migrations.Migration):
dependencies = [
("core", "0118_riskscenario_antecedent_scenarios_and_more"),
]

operations = [
migrations.AddField(
model_name="evidencerevision",
name="attachment_hash",
field=models.CharField(
blank=True,
db_index=True,
help_text="SHA256 hash of the attachment file for integrity verification",
max_length=64,
null=True,
verbose_name="Attachment SHA256 Hash",
),
),
migrations.RunPython(backfill_attachment_hashes, reverse_backfill),
]
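
A note on write volume for the backfill above: save(update_fields=["attachment_hash"]) issues one UPDATE per revision. For large tables, batching with bulk_update cuts round trips; a minimal sketch, assuming the same historical model as above and an arbitrary batch size of 500:

    batch = []
    for revision in revisions_with_attachments.iterator(chunk_size=100):
        if revision.attachment and default_storage.exists(revision.attachment.name):
            hash_obj = hashlib.sha256()
            with default_storage.open(revision.attachment.name, "rb") as f:
                for chunk in iter(lambda: f.read(1024 * 1024), b""):
                    hash_obj.update(chunk)
            revision.attachment_hash = hash_obj.hexdigest()
            batch.append(revision)
        if len(batch) >= 500:  # flush in batches; 500 is an assumption
            EvidenceRevision.objects.bulk_update(batch, ["attachment_hash"])
            batch.clear()
    if batch:
        EvidenceRevision.objects.bulk_update(batch, ["attachment_hash"])
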
58 changes: 58 additions & 0 deletions backend/core/models.py
@@ -19,6 +19,7 @@
from django.core import serializers
from django.core.exceptions import ValidationError
from django.core.validators import MaxValueValidator, RegexValidator, MinValueValidator
from django.core.files.storage import default_storage
⚠️ Potential issue | 🟠 Major

Prefer self.attachment.storage over default_storage for hashing reads

If EvidenceRevision.attachment ever uses a non-default storage backend, default_storage may read a different file than the one referenced by the field. Consider using the field’s storage, falling back to default_storage only if needed.

Proposed fix (also removes the need for exists())
-                    if default_storage.exists(self.attachment.name):
-                        with default_storage.open(self.attachment.name, "rb") as f:
+                    storage = getattr(self.attachment, "storage", default_storage)
+                    try:
+                        f = storage.open(self.attachment.name, "rb")
+                    except Exception:
+                        f = None
+                    if f is not None:
+                        with f:
                             for chunk in iter(
                                 lambda: f.read(1024 * 1024), b""
                             ):  # 1MB chunks
                                 hash_obj.update(chunk)
                         self.attachment_hash = hash_obj.hexdigest()
                     else:
🤖 Prompt for AI Agents
In @backend/core/models.py at line 22, The code uses default_storage for hashing
reads which can misread files if EvidenceRevision.attachment uses a non-default
backend; update any reads to use the FileField's storage (i.e., use
self.attachment.storage) and only fall back to default_storage if the field or
its storage is missing, and remove any redundant exists() checks that are
unnecessary when using the field storage; locate usages related to
EvidenceRevision.attachment and replace
default_storage.open/default_storage.path/default_storage.exists calls with
self.attachment.storage equivalents (with a default_storage fallback).
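
Standalone, the comment's suggestion could be factored into a small helper; a sketch, assuming a Django FieldFile (the name compute_attachment_hash is illustrative, not from this PR):

    import hashlib

    from django.core.files.storage import default_storage


    def compute_attachment_hash(field_file, chunk_size=1024 * 1024):
        # Use the storage the field actually points at; fall back to
        # default_storage only if the FieldFile exposes none.
        storage = getattr(field_file, "storage", default_storage)
        hash_obj = hashlib.sha256()
        try:
            f = storage.open(field_file.name, "rb")
        except (FileNotFoundError, OSError):
            # Backend-specific errors may differ; treat as "not persisted yet".
            return None
        with f:
            for chunk in iter(lambda: f.read(chunk_size), b""):
                hash_obj.update(chunk)
        return hash_obj.hexdigest()
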

from django.db import models, transaction
from django.db.models import F, Q, OuterRef, Subquery
from django.forms.models import model_to_dict
@@ -3406,6 +3407,14 @@ class EvidenceRevision(AbstractBaseModel, FolderMixin):
verbose_name=_("Attachment"),
validators=[validate_file_size, validate_file_name],
)
attachment_hash = models.CharField(
max_length=64,
blank=True,
null=True,
db_index=True,
verbose_name=_("Attachment SHA256 Hash"),
help_text=_("SHA256 hash of the attachment file for integrity verification"),
)
link = models.URLField(
blank=True,
null=True,
@@ -3426,6 +3435,55 @@ def save(self, *args, **kwargs):

self.is_published = self.evidence.is_published

# Compute attachment hash if attachment exists and has changed
if self.attachment:
# Check if this is a new attachment or if it has changed
should_compute_hash = False

if self.pk: # Existing record
try:
old_instance = EvidenceRevision.objects.get(pk=self.pk)
# Check if attachment changed
if old_instance.attachment != self.attachment:
should_compute_hash = True
except EvidenceRevision.DoesNotExist:
should_compute_hash = True
else: # New record
should_compute_hash = True

if should_compute_hash:
try:
# Compute SHA256 hash using chunked reading to avoid OOM
hash_obj = hashlib.sha256()
if default_storage.exists(self.attachment.name):
with default_storage.open(self.attachment.name, "rb") as f:
for chunk in iter(
lambda: f.read(1024 * 1024), b""
): # 1MB chunks
hash_obj.update(chunk)
self.attachment_hash = hash_obj.hexdigest()
else:
# File not yet saved to storage, try reading from UploadedFile
if hasattr(self.attachment, "chunks"):
for chunk in self.attachment.chunks(chunk_size=1024 * 1024):
hash_obj.update(chunk)
self.attachment_hash = hash_obj.hexdigest()
# Reset file position for subsequent operations
if hasattr(self.attachment, "seek"):
self.attachment.seek(0)
except Exception as e:
logger = get_logger(__name__)
logger.warning(
"Failed to compute attachment hash",
revision_id=self.pk,
error=str(e),
)
# Don't fail the save if hash computation fails
self.attachment_hash = None
else:
# No attachment, clear the hash
self.attachment_hash = None

super().save(*args, **kwargs)

def filename(self):
194 changes: 194 additions & 0 deletions backend/serdes/export_utils.py
@@ -0,0 +1,194 @@
import io
import os
import zipfile
from typing import Optional, BinaryIO

import structlog
from django.core.files.storage import default_storage
from django.db.models import QuerySet

from core.models import Evidence, EvidenceRevision

logger = structlog.get_logger(__name__)


class AttachmentExporter:
def collect_all_attachments(self, scope: Optional[QuerySet] = None) -> QuerySet:
if scope is None:
revisions = EvidenceRevision.objects.all()
else:
revisions = scope

return revisions.filter(attachment__isnull=False).select_related(
"evidence", "folder"
)

def package_attachments_to_zip(
self, revisions: QuerySet, zipf: zipfile.ZipFile
) -> int:
count = 0

for revision in revisions:
if revision.attachment and default_storage.exists(revision.attachment.name):
try:
with default_storage.open(revision.attachment.name, "rb") as file:
file_content = file.read()

filename = (
f"{revision.evidence_id}_v{revision.version}_"
f"{os.path.basename(revision.attachment.name)}"
)

zip_path = os.path.join(
"attachments", "evidence-revisions", filename
)

zipf.writestr(zip_path, file_content)
count += 1

⚠️ Potential issue | 🟠 Major

Exporter: avoid full in-RAM reads + use POSIX zip paths (Windows-safe).

 import io
 import os
+import posixpath
+import shutil
 import zipfile
@@
-                        zip_path = os.path.join(
-                            "attachments", "evidence-revisions", filename
-                        )
-
-                        zipf.writestr(zip_path, file_content)
+                        zip_path = posixpath.join(
+                            "attachments", "evidence-revisions", filename
+                        )
+                        with zipf.open(zip_path, "w") as zf:
+                            shutil.copyfileobj(file, zf, length=1024 * 1024)
                         count += 1

Also applies to: 61-82

🤖 Prompt for AI Agents
In backend/serdes/export_utils.py around lines 26-48 (and similarly 61-82), the
exporter currently reads entire attachment files into memory and builds zip
paths with os.path.join (which yields backslashes on Windows); change to stream
the file into the zip to avoid full in-RAM reads and use POSIX-style zip paths.
Specifically, open the storage file as a binary file-like object and create the
entry in the ZipFile via zipfile.ZipFile.open(zip_path, "w") and copy the file
contents into that entry in chunks (e.g., shutil.copyfileobj) instead of
file.read(); build zip_path using posixpath.join("attachments",
"evidence-revisions", filename) so the stored path uses forward slashes and
apply the same changes to the other block at lines 61-82.
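
Assembled, the streamed write the comment proposes could look like this (a sketch under the comment's assumptions; zipf.open(name, "w") requires Python 3.6+):

    import posixpath
    import shutil

    with default_storage.open(revision.attachment.name, "rb") as src:
        zip_path = posixpath.join("attachments", "evidence-revisions", filename)
        # Copy in 1 MB chunks straight into the archive entry,
        # never holding the whole file in memory.
        with zipf.open(zip_path, "w") as dst:
            shutil.copyfileobj(src, dst, length=1024 * 1024)
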

except Exception as e:
logger.error(
"Failed to add attachment to ZIP",
revision_id=revision.id,
evidence_id=revision.evidence_id,
attachment_name=revision.attachment.name,
error=str(e),
)
continue

return count

def create_attachments_zip(
self, revisions: Optional[QuerySet] = None
) -> tuple[io.BytesIO, int]:
if revisions is None:
revisions = self.collect_all_attachments()

logger.info("Creating attachments ZIP", total_revisions=revisions.count())

zip_buffer = io.BytesIO()

with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zipf:
count = self.package_attachments_to_zip(revisions, zipf)

zip_buffer.seek(0)

logger.info(
"Attachments ZIP created successfully",
attachments_count=count,
zip_size=len(zip_buffer.getvalue()),
)

return zip_buffer, count


class AttachmentImporter:
def extract_attachments_from_zip(
self, zip_file: BinaryIO, dry_run: bool = False
) -> dict:
stats = {"processed": 0, "restored": 0, "errors": []}

try:
with zipfile.ZipFile(zip_file, "r") as zipf:
attachment_files = [
f
for f in zipf.namelist()
if f.startswith("attachments/evidence-revisions/")
and not f.endswith("/")
]

stats["processed"] = len(attachment_files)

logger.info(
"Starting attachment import",
total_files=stats["processed"],
dry_run=dry_run,
)

for file_path in attachment_files:
try:
filename = os.path.basename(file_path)
parts = filename.split("_", 2)

if len(parts) < 3:
stats["errors"].append(
f"Invalid filename format: {filename}"
)
continue

evidence_id = parts[0]
version_str = parts[1]
original_filename = parts[2]

if not version_str.startswith("v"):
stats["errors"].append(
f"Invalid version format in: {filename}"
)
continue

version = int(version_str[1:])

if not dry_run:
# Find the corresponding EvidenceRevision
try:
revision = EvidenceRevision.objects.get(
evidence_id=evidence_id, version=version
)

file_content = zipf.read(file_path)

storage_path = (
f"evidence-revisions/{evidence_id}/"
f"v{version}/{original_filename}"
)

saved_path = default_storage.save(
storage_path, io.BytesIO(file_content)
)

revision.attachment = saved_path
revision.save(update_fields=["attachment"])

stats["restored"] += 1

except EvidenceRevision.DoesNotExist:
stats["errors"].append(
f"EvidenceRevision not found: "
f"evidence_id={evidence_id}, version={version}"
)
except Exception as e:
stats["errors"].append(f"Failed to restore {filename}")
else:
stats["restored"] += 1

except Exception as e:
logger.error(
"Error processing file path",
file_path=file_path,
error=str(e),
exc_info=True,
)
stats["errors"].append(
f"Error processing {os.path.basename(file_path)}"
)
continue

except zipfile.BadZipFile:
stats["errors"].append("Invalid ZIP file")
except Exception as e:
logger.error(
"Unexpected error during attachment import",
error=str(e),
exc_info=True,
)
stats["errors"].append("Unexpected error occurred during import")

logger.info(
"Attachment import completed",
processed=stats["processed"],
restored=stats["restored"],
errors_count=len(stats["errors"]),
dry_run=dry_run,
)

return stats
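
A minimal round-trip sketch of the two classes above, assuming a Django shell with storage configured (the backup file name is arbitrary):

    from serdes.export_utils import AttachmentExporter, AttachmentImporter

    exporter = AttachmentExporter()
    zip_buffer, count = exporter.create_attachments_zip()
    with open("attachments_backup.zip", "wb") as f:
        f.write(zip_buffer.getvalue())

    importer = AttachmentImporter()
    with open("attachments_backup.zip", "rb") as f:
        stats = importer.extract_attachments_from_zip(f, dry_run=True)  # validate only
    print(stats)  # {'processed': ..., 'restored': ..., 'errors': [...]}
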
26 changes: 26 additions & 0 deletions backend/serdes/urls.py
@@ -9,4 +9,30 @@
views.LoadBackupView.as_view(),
name="load-backup",
),
path(
"export-attachments/",
views.ExportAttachmentsView.as_view(),
name="export-attachments",
),
path(
"full-restore/",
views.FullRestoreView.as_view(),
name="full-restore",
),
# New streaming batch endpoints
path(
"attachment-metadata/",
views.AttachmentMetadataView.as_view(),
name="attachment-metadata",
),
path(
"batch-download-attachments/",
views.BatchDownloadAttachmentsView.as_view(),
name="batch-download-attachments",
),
path(
"batch-upload-attachments/",
views.BatchUploadAttachmentsView.as_view(),
name="batch-upload-attachments",
),
]
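
For orientation, a hedged client-side sketch against two of these routes (the /api/serdes mount point, token auth, and the multipart field name "file" are all assumptions; the actual contract lives in the views, which this diff does not show):

    import requests

    BASE = "https://example.com/api/serdes"  # mount point is an assumption
    headers = {"Authorization": "Token <token>"}

    # Stream the exported attachments archive to disk.
    resp = requests.get(f"{BASE}/export-attachments/", headers=headers, stream=True)
    with open("attachments.zip", "wb") as f:
        for chunk in resp.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)

    # Re-upload the archive through the batch upload endpoint.
    with open("attachments.zip", "rb") as f:
        resp = requests.post(
            f"{BASE}/batch-upload-attachments/",
            headers=headers,
            files={"file": f},  # field name is an assumption
        )
    print(resp.status_code)
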