@@ -0,0 +1,73 @@
# Generated by Django 5.2.8 on 2025-12-15 14:07

import hashlib
from django.db import migrations, models
from django.core.files.storage import default_storage


def backfill_attachment_hashes(apps, schema_editor):
"""
Compute and store SHA256 hashes for all existing attachments.
Uses chunked reading to avoid loading large files into memory.
"""
EvidenceRevision = apps.get_model("core", "EvidenceRevision")

revisions_with_attachments = EvidenceRevision.objects.filter(
attachment__isnull=False
).exclude(attachment="")

total = revisions_with_attachments.count()
processed = 0
errors = 0

print(f"Backfilling attachment hashes for {total} evidence revisions...")

for revision in revisions_with_attachments.iterator(chunk_size=100):
try:
if revision.attachment and default_storage.exists(revision.attachment.name):
hash_obj = hashlib.sha256()
with default_storage.open(revision.attachment.name, "rb") as f:
for chunk in iter(lambda: f.read(1024 * 1024), b""): # 1MB chunks
hash_obj.update(chunk)

revision.attachment_hash = hash_obj.hexdigest()
revision.save(update_fields=["attachment_hash"])
processed += 1

if processed % 100 == 0:
print(f" Processed {processed}/{total} revisions...")
except Exception as e:
errors += 1
print(f" Error processing revision {revision.id}: {e}")

print(f"Completed: {processed} hashes computed, {errors} errors")


def reverse_backfill(apps, schema_editor):
"""
Clear all attachment hashes (reverse operation).
"""
EvidenceRevision = apps.get_model("core", "EvidenceRevision")
EvidenceRevision.objects.update(attachment_hash=None)


class Migration(migrations.Migration):
dependencies = [
("core", "0118_riskscenario_antecedent_scenarios_and_more"),
]

operations = [
migrations.AddField(
model_name="evidencerevision",
name="attachment_hash",
field=models.CharField(
blank=True,
db_index=True,
help_text="SHA256 hash of the attachment file for integrity verification",
max_length=64,
null=True,
verbose_name="Attachment SHA256 Hash",
),
),
migrations.RunPython(backfill_attachment_hashes, reverse_backfill),
]
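
A note on write volume for the backfill above: save(update_fields=["attachment_hash"]) issues one UPDATE per revision. For large tables, batching with bulk_update cuts round trips; a minimal sketch, assuming the same historical model as above and an arbitrary batch size of 500:

    batch = []
    for revision in revisions_with_attachments.iterator(chunk_size=100):
        if revision.attachment and default_storage.exists(revision.attachment.name):
            hash_obj = hashlib.sha256()
            with default_storage.open(revision.attachment.name, "rb") as f:
                for chunk in iter(lambda: f.read(1024 * 1024), b""):
                    hash_obj.update(chunk)
            revision.attachment_hash = hash_obj.hexdigest()
            batch.append(revision)
        if len(batch) >= 500:  # flush in batches; 500 is an assumption
            EvidenceRevision.objects.bulk_update(batch, ["attachment_hash"])
            batch.clear()
    if batch:
        EvidenceRevision.objects.bulk_update(batch, ["attachment_hash"])
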
58 changes: 58 additions & 0 deletions backend/core/models.py
@@ -19,6 +19,7 @@
from django.core import serializers
from django.core.exceptions import ValidationError
from django.core.validators import MaxValueValidator, RegexValidator, MinValueValidator
from django.core.files.storage import default_storage
⚠️ Potential issue | 🟠 Major

Prefer self.attachment.storage over default_storage for hashing reads

If EvidenceRevision.attachment ever uses a non-default storage backend, default_storage may read a different file than the one referenced by the field. Consider using the field’s storage, falling back to default_storage only if needed.

Proposed fix (also removes the need for exists())
-                    if default_storage.exists(self.attachment.name):
-                        with default_storage.open(self.attachment.name, "rb") as f:
+                    storage = getattr(self.attachment, "storage", default_storage)
+                    try:
+                        f = storage.open(self.attachment.name, "rb")
+                    except Exception:
+                        f = None
+                    if f is not None:
+                        with f:
                             for chunk in iter(
                                 lambda: f.read(1024 * 1024), b""
                             ):  # 1MB chunks
                                 hash_obj.update(chunk)
                         self.attachment_hash = hash_obj.hexdigest()
                     else:
🤖 Prompt for AI Agents
In @backend/core/models.py at line 22, The code uses default_storage for hashing
reads which can misread files if EvidenceRevision.attachment uses a non-default
backend; update any reads to use the FileField's storage (i.e., use
self.attachment.storage) and only fall back to default_storage if the field or
its storage is missing, and remove any redundant exists() checks that are
unnecessary when using the field storage; locate usages related to
EvidenceRevision.attachment and replace
default_storage.open/default_storage.path/default_storage.exists calls with
self.attachment.storage equivalents (with a default_storage fallback).
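
Standalone, the comment's suggestion could be factored into a small helper; a sketch, assuming a Django FieldFile (the name compute_attachment_hash is illustrative, not from this PR):

    import hashlib

    from django.core.files.storage import default_storage


    def compute_attachment_hash(field_file, chunk_size=1024 * 1024):
        # Use the storage the field actually points at; fall back to
        # default_storage only if the FieldFile exposes none.
        storage = getattr(field_file, "storage", default_storage)
        hash_obj = hashlib.sha256()
        try:
            f = storage.open(field_file.name, "rb")
        except (FileNotFoundError, OSError):
            # Backend-specific errors may differ; treat as "not persisted yet".
            return None
        with f:
            for chunk in iter(lambda: f.read(chunk_size), b""):
                hash_obj.update(chunk)
        return hash_obj.hexdigest()
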

from django.db import models, transaction
from django.db.models import F, Q, OuterRef, Subquery
from django.forms.models import model_to_dict
@@ -3406,6 +3407,14 @@ class EvidenceRevision(AbstractBaseModel, FolderMixin):
verbose_name=_("Attachment"),
validators=[validate_file_size, validate_file_name],
)
attachment_hash = models.CharField(
max_length=64,
blank=True,
null=True,
db_index=True,
verbose_name=_("Attachment SHA256 Hash"),
help_text=_("SHA256 hash of the attachment file for integrity verification"),
)
link = models.URLField(
blank=True,
null=True,
@@ -3426,6 +3435,55 @@ def save(self, *args, **kwargs):

self.is_published = self.evidence.is_published

# Compute attachment hash if attachment exists and has changed
if self.attachment:
# Check if this is a new attachment or if it has changed
should_compute_hash = False

if self.pk: # Existing record
try:
old_instance = EvidenceRevision.objects.get(pk=self.pk)
# Check if attachment changed
if old_instance.attachment != self.attachment:
should_compute_hash = True
except EvidenceRevision.DoesNotExist:
should_compute_hash = True
else: # New record
should_compute_hash = True

if should_compute_hash:
try:
# Compute SHA256 hash using chunked reading to avoid OOM
hash_obj = hashlib.sha256()
if default_storage.exists(self.attachment.name):
with default_storage.open(self.attachment.name, "rb") as f:
for chunk in iter(
lambda: f.read(1024 * 1024), b""
): # 1MB chunks
hash_obj.update(chunk)
self.attachment_hash = hash_obj.hexdigest()
else:
# File not yet saved to storage, try reading from UploadedFile
if hasattr(self.attachment, "chunks"):
for chunk in self.attachment.chunks(chunk_size=1024 * 1024):
hash_obj.update(chunk)
self.attachment_hash = hash_obj.hexdigest()
# Reset file position for subsequent operations
if hasattr(self.attachment, "seek"):
self.attachment.seek(0)
except Exception as e:
logger = get_logger(__name__)
logger.warning(
"Failed to compute attachment hash",
revision_id=self.pk,
error=str(e),
)
# Don't fail the save if hash computation fails
self.attachment_hash = None
else:
# No attachment, clear the hash
self.attachment_hash = None

super().save(*args, **kwargs)

def filename(self):
194 changes: 194 additions & 0 deletions backend/serdes/export_utils.py
@@ -0,0 +1,194 @@
import io
import os
import zipfile
from typing import Optional, BinaryIO

import structlog
from django.core.files.storage import default_storage
from django.db.models import QuerySet

from core.models import Evidence, EvidenceRevision

logger = structlog.get_logger(__name__)


class AttachmentExporter:
def collect_all_attachments(self, scope: Optional[QuerySet] = None) -> QuerySet:
if scope is None:
revisions = EvidenceRevision.objects.all()
else:
revisions = scope

return revisions.filter(attachment__isnull=False).select_related(
"evidence", "folder"
)

def package_attachments_to_zip(
self, revisions: QuerySet, zipf: zipfile.ZipFile
) -> int:
count = 0

for revision in revisions:
if revision.attachment and default_storage.exists(revision.attachment.name):
try:
with default_storage.open(revision.attachment.name, "rb") as file:
file_content = file.read()

filename = (
f"{revision.evidence_id}_v{revision.version}_"
f"{os.path.basename(revision.attachment.name)}"
)

zip_path = os.path.join(
"attachments", "evidence-revisions", filename
)

zipf.writestr(zip_path, file_content)
count += 1

⚠️ Potential issue | 🟠 Major

Exporter: avoid full in-RAM reads + use POSIX zip paths (Windows-safe).

 import io
 import os
+import posixpath
+import shutil
 import zipfile
@@
-                        zip_path = os.path.join(
-                            "attachments", "evidence-revisions", filename
-                        )
-
-                        zipf.writestr(zip_path, file_content)
+                        zip_path = posixpath.join(
+                            "attachments", "evidence-revisions", filename
+                        )
+                        with zipf.open(zip_path, "w") as zf:
+                            shutil.copyfileobj(file, zf, length=1024 * 1024)
                         count += 1

Also applies to: 61-82

🤖 Prompt for AI Agents
In backend/serdes/export_utils.py around lines 26-48 (and similarly 61-82), the
exporter currently reads entire attachment files into memory and builds zip
paths with os.path.join (which yields backslashes on Windows); change to stream
the file into the zip to avoid full in-RAM reads and use POSIX-style zip paths.
Specifically, open the storage file as a binary file-like object and create the
entry in the ZipFile via zipfile.ZipFile.open(zip_path, "w") and copy the file
contents into that entry in chunks (e.g., shutil.copyfileobj) instead of
file.read(); build zip_path using posixpath.join("attachments",
"evidence-revisions", filename) so the stored path uses forward slashes and
apply the same changes to the other block at lines 61-82.
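
Assembled, the streamed write the comment proposes could look like this (a sketch under the comment's assumptions; zipf.open(name, "w") requires Python 3.6+):

    import posixpath
    import shutil

    with default_storage.open(revision.attachment.name, "rb") as src:
        zip_path = posixpath.join("attachments", "evidence-revisions", filename)
        # Copy in 1 MB chunks straight into the archive entry,
        # never holding the whole file in memory.
        with zipf.open(zip_path, "w") as dst:
            shutil.copyfileobj(src, dst, length=1024 * 1024)
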

except Exception as e:
logger.error(
"Failed to add attachment to ZIP",
revision_id=revision.id,
evidence_id=revision.evidence_id,
attachment_name=revision.attachment.name,
error=str(e),
)
continue

return count

def create_attachments_zip(
self, revisions: Optional[QuerySet] = None
) -> tuple[io.BytesIO, int]:
if revisions is None:
revisions = self.collect_all_attachments()

logger.info("Creating attachments ZIP", total_revisions=revisions.count())

zip_buffer = io.BytesIO()

with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zipf:
count = self.package_attachments_to_zip(revisions, zipf)

zip_buffer.seek(0)

logger.info(
"Attachments ZIP created successfully",
attachments_count=count,
zip_size=len(zip_buffer.getvalue()),
)

return zip_buffer, count


class AttachmentImporter:
def extract_attachments_from_zip(
self, zip_file: BinaryIO, dry_run: bool = False
) -> dict:
stats = {"processed": 0, "restored": 0, "errors": []}

try:
with zipfile.ZipFile(zip_file, "r") as zipf:
attachment_files = [
f
for f in zipf.namelist()
if f.startswith("attachments/evidence-revisions/")
and not f.endswith("/")
]

stats["processed"] = len(attachment_files)

logger.info(
"Starting attachment import",
total_files=stats["processed"],
dry_run=dry_run,
)

for file_path in attachment_files:
try:
filename = os.path.basename(file_path)
parts = filename.split("_", 2)

if len(parts) < 3:
stats["errors"].append(
f"Invalid filename format: {filename}"
)
continue

evidence_id = parts[0]
version_str = parts[1]
original_filename = parts[2]

if not version_str.startswith("v"):
stats["errors"].append(
f"Invalid version format in: {filename}"
)
continue

version = int(version_str[1:])

if not dry_run:
# Find the corresponding EvidenceRevision
try:
revision = EvidenceRevision.objects.get(
evidence_id=evidence_id, version=version
)

file_content = zipf.read(file_path)

storage_path = (
f"evidence-revisions/{evidence_id}/"
f"v{version}/{original_filename}"
)

saved_path = default_storage.save(
storage_path, io.BytesIO(file_content)
)

revision.attachment = saved_path
revision.save(update_fields=["attachment"])

stats["restored"] += 1

except EvidenceRevision.DoesNotExist:
stats["errors"].append(
f"EvidenceRevision not found: "
f"evidence_id={evidence_id}, version={version}"
)
except Exception as e:
stats["errors"].append(f"Failed to restore {filename}")
else:
stats["restored"] += 1

except Exception as e:
logger.error(
"Error processing file path",
file_path=file_path,
error=str(e),
exc_info=True,
)
stats["errors"].append(
f"Error processing {os.path.basename(file_path)}"
)
continue

except zipfile.BadZipFile:
stats["errors"].append("Invalid ZIP file")
except Exception as e:
logger.error(
"Unexpected error during attachment import",
error=str(e),
exc_info=True,
)
stats["errors"].append("Unexpected error occurred during import")

logger.info(
"Attachment import completed",
processed=stats["processed"],
restored=stats["restored"],
errors_count=len(stats["errors"]),
dry_run=dry_run,
)

return stats
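
A minimal round-trip sketch of the two classes above, assuming a Django shell with storage configured (the backup file name is arbitrary):

    from serdes.export_utils import AttachmentExporter, AttachmentImporter

    exporter = AttachmentExporter()
    zip_buffer, count = exporter.create_attachments_zip()
    with open("attachments_backup.zip", "wb") as f:
        f.write(zip_buffer.getvalue())

    importer = AttachmentImporter()
    with open("attachments_backup.zip", "rb") as f:
        stats = importer.extract_attachments_from_zip(f, dry_run=True)  # validate only
    print(stats)  # {'processed': ..., 'restored': ..., 'errors': [...]}
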
26 changes: 26 additions & 0 deletions backend/serdes/urls.py
@@ -9,4 +9,30 @@
views.LoadBackupView.as_view(),
name="load-backup",
),
path(
"export-attachments/",
views.ExportAttachmentsView.as_view(),
name="export-attachments",
),
path(
"full-restore/",
views.FullRestoreView.as_view(),
name="full-restore",
),
# New streaming batch endpoints
path(
"attachment-metadata/",
views.AttachmentMetadataView.as_view(),
name="attachment-metadata",
),
path(
"batch-download-attachments/",
views.BatchDownloadAttachmentsView.as_view(),
name="batch-download-attachments",
),
path(
"batch-upload-attachments/",
views.BatchUploadAttachmentsView.as_view(),
name="batch-upload-attachments",
),
]
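
For orientation, a hedged client-side sketch against two of these routes (the /api/serdes mount point, token auth, and the multipart field name "file" are all assumptions; the actual contract lives in the views, which this diff does not show):

    import requests

    BASE = "https://example.com/api/serdes"  # mount point is an assumption
    headers = {"Authorization": "Token <token>"}

    # Stream the exported attachments archive to disk.
    resp = requests.get(f"{BASE}/export-attachments/", headers=headers, stream=True)
    with open("attachments.zip", "wb") as f:
        for chunk in resp.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)

    # Re-upload the archive through the batch upload endpoint.
    with open("attachments.zip", "rb") as f:
        resp = requests.post(
            f"{BASE}/batch-upload-attachments/",
            headers=headers,
            files={"file": f},  # field name is an assumption
        )
    print(resp.status_code)
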