Skip to content
Draft
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
6dbd910
outline-import: backend upload endpoint + frontend upload page\n\nBac…
NicolasRitouet Sep 13, 2025
1fd4406
frontend(import-outline): fix baseApiUrl import path
NicolasRitouet Sep 13, 2025
becc514
outline-import: run malware scan on uploaded assets
NicolasRitouet Sep 13, 2025
9f4fb06
tests(outline-import): add API tests for upload (.zip) flow\n- Anonym…
NicolasRitouet Sep 13, 2025
4f3b62d
refactor(outline-import): move import logic to core/services/outline_…
NicolasRitouet Sep 13, 2025
fa65c45
outline-import: reinforce safety and tests\n- Zip Slip protection (re…
NicolasRitouet Sep 13, 2025
cce6c96
Add Outline import API view
NicolasRitouet Sep 16, 2025
6146a48
Remove legacy Outline import viewset
NicolasRitouet Sep 16, 2025
453b153
Improve Outline import validation and UI
NicolasRitouet Sep 17, 2025
b7a7663
feat(outline-import): Add markdown preprocessing for unsupported Bloc…
NicolasRitouet Sep 24, 2025
06d9c2b
Revert "feat(outline-import): Add markdown preprocessing for unsuppor…
NicolasRitouet Sep 26, 2025
95fa210
Merge main into feature/outline-import
NicolasRitouet Oct 9, 2025
68e58b2
fix(outline-import): Fix CSRF token and nested documents handling
NicolasRitouet Oct 12, 2025
538c641
Cleanup imports
NicolasRitouet Oct 12, 2025
619b624
Fix import outline
NicolasRitouet Oct 12, 2025
e1f5a13
add new line
NicolasRitouet Oct 12, 2025
7d6f055
es-lint fixes
NicolasRitouet Oct 13, 2025
1d65ca3
fix(outline-import): Address PR review comments
NicolasRitouet Nov 29, 2025
600672d
Merge upstream/main into feature/outline-import
NicolasRitouet Nov 29, 2025
be6a2cb
fix(outline-import): Add async processing, improve UI, and address PR…
NicolasRitouet Nov 29, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions src/backend/core/api/imports.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
"""Import endpoints for Outline (zip upload)."""

from __future__ import annotations

import io
import zipfile

import rest_framework as drf

from core.services.outline_import import OutlineImportError, process_outline_zip


# ---------- Outline (Zip Upload) ----------


class OutlineImportUploadView(drf.views.APIView):
parser_classes = [drf.parsers.MultiPartParser]
permission_classes = [drf.permissions.IsAuthenticated]

def post(self, request):
uploaded = request.FILES.get("file")
if not uploaded:
raise drf.exceptions.ValidationError({"file": "File is required"})

name = getattr(uploaded, "name", "")
if not name.endswith(".zip"):
raise drf.exceptions.ValidationError({"file": "Must be a .zip file"})
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You should rely on a DRF serializer here to validate the input instead of doing it in the view. You could perhaps reuse the FileUploadSerializer present in the serializer module (src/backend/core/api/serializers.py).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Then, once the input is validated, you have to rely on the malware_detection feature to validate the zip content. But we have to design a workflow for this, because that process is async: once the malware detection has ended, process_outline_zip should start.


try:
content = uploaded.read()
# Fail fast if the upload is not a valid zip archive
with zipfile.ZipFile(io.BytesIO(content)):
pass
created_ids = process_outline_zip(request.user, content)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I suggest saving the uploaded zip to the bucket storage (you can rely on the Django storage API). That way you can create a Celery task to process the file asynchronously.

except zipfile.BadZipFile as exc:
raise drf.exceptions.ValidationError({"file": "Invalid zip archive"}) from exc
except OutlineImportError as exc:
raise drf.exceptions.ValidationError({"file": str(exc)}) from exc

return drf.response.Response(
{"created_document_ids": created_ids}, status=drf.status.HTTP_201_CREATED
)
205 changes: 205 additions & 0 deletions src/backend/core/services/outline_import.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
"""Service to import an Outline export (.zip) into Docs documents."""

from __future__ import annotations

import io
import mimetypes
import re
import uuid
import zipfile
from typing import Iterable
import posixpath

from django.conf import settings
from django.core.files.storage import default_storage

from lasuite.malware_detection import malware_detection

from core import enums, models
from core.services.converter_services import YdocConverter


class OutlineImportError(Exception):
"""Raised when the Outline archive is invalid or unsafe."""


def _ensure_dir_documents(user, dir_path: str, dir_docs: dict[str, models.Document]) -> models.Document | None:
"""Ensure each path segment in dir_path has a container document.

Returns the deepest parent document or None when dir_path is empty.
"""
if not dir_path:
return None

parts = [p for p in dir_path.split("/") if p]
parent: models.Document | None = None
current = ""
for part in parts:
current = f"{current}/{part}" if current else part
if current in dir_docs:
parent = dir_docs[current]
continue

if parent is None:
doc = models.Document.add_root(
depth=1,
creator=user,
title=part,
link_reach=models.LinkReachChoices.RESTRICTED,
)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
)
)
models.DocumentAccess.objects.create(
document=doc,
user=user,
role=models.RoleChoices.OWNER,
)

else:
doc = parent.add_child(creator=user, title=part)

models.DocumentAccess.objects.update_or_create(
document=doc,
user=user,
defaults={"role": models.RoleChoices.OWNER},
)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
models.DocumentAccess.objects.update_or_create(
document=doc,
user=user,
defaults={"role": models.RoleChoices.OWNER},
)

You have to define an owner access for the user only on the root document. Then the children will inherit from this access.

dir_docs[current] = doc
parent = doc

return parent


def _upload_attachment(user, doc: models.Document, arcname: str, data: bytes) -> str:
"""Upload a binary asset into object storage and return its public media URL."""
content_type, _ = mimetypes.guess_type(arcname)
Copy link
Collaborator

@StephanMeijer StephanMeijer Oct 29, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What does the underlying library use? Mimetype guessing is not always stable (even with libmagic, results can differ between versions/environments).
I would suggest good testing, preferably in different environments if possible.

ext = (arcname.split(".")[-1] or "bin").lower()
file_id = uuid.uuid4()
key = f"{doc.key_base}/{enums.ATTACHMENTS_FOLDER:s}/{file_id!s}.{ext}"
extra_args = {
"Metadata": {
"owner": str(user.id),
"status": enums.DocumentAttachmentStatus.READY,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
"status": enums.DocumentAttachmentStatus.READY,
"status": enums.DocumentAttachmentStatus.PROCESSING,

},
}
if content_type:
extra_args["ContentType"] = content_type

default_storage.connection.meta.client.upload_fileobj(
io.BytesIO(data), default_storage.bucket_name, key, ExtraArgs=extra_args
)
doc.attachments.append(key)
doc.save(update_fields=["attachments", "updated_at"])
malware_detection.analyse_file(key, document_id=doc.id)
return f"{settings.MEDIA_BASE_URL}{settings.MEDIA_URL}{key}"


def process_outline_zip(user, zip_bytes: bytes) -> list[str]:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Following the previous comment asking to save the file on the bucket, you can turn this function into a Celery task and execute it asynchronously.

"""Process an Outline export zip and create Docs documents.

Returns the list of created document IDs (stringified UUIDs) corresponding to
markdown-backed documents. Container folders used to rebuild hierarchy are not listed.
"""
archive = zipfile.ZipFile(io.BytesIO(zip_bytes))

# Basic Zip Slip protection: refuse absolute or parent-traversal entries
for name in archive.namelist():
# Normalize to posix separators and check traversal
if name.startswith("/") or "\\" in name:
raise OutlineImportError("Unsafe path in archive")
parts = [p for p in name.split("/") if p]
if any(part == ".." for part in parts):
raise OutlineImportError("Unsafe path in archive")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this always unsafe or only when .. goes beyond the root you are iterating over?


created_ids: list[str] = []
dir_docs: dict[str, models.Document] = {}
md_files: Iterable[str] = sorted(
[
n
for n in archive.namelist()
if n.lower().endswith(".md")
and not n.startswith("__MACOSX/")
and not any(part.startswith(".") for part in n.split("/"))
]
)

# Build a set of md files that have corresponding directories (Outline nested docs)
# e.g., "Doc.md" and "Doc/" both exist -> "Doc" is a parent with nested children
md_with_dirs: set[str] = set()
for md_path in md_files:
# Remove .md extension to get potential directory name
base_path = md_path.rsplit(".md", 1)[0]
# Check if there's a directory with the same name
if any(n.startswith(f"{base_path}/") for n in archive.namelist()):
md_with_dirs.add(base_path)

img_pattern = re.compile(r"!\[[^\]]*\]\(([^)]+)\)")

def read_bytes(path_in_zip: str) -> bytes | None:
    """Return the raw content of an archive member, or None when it is absent.

    Reads from the enclosing ``archive`` (the zip being imported).
    """
    try:
        # ZipFile.read opens the member, reads it fully and closes it again.
        return archive.read(path_in_zip)
    except KeyError:
        # Raised by zipfile when no member has that name.
        return None

converter = YdocConverter()

for md_path in md_files:
dir_path, file_name = (
(md_path.rsplit("/", 1) + [""])[:2] if "/" in md_path else ("", md_path)
)
parent_doc = _ensure_dir_documents(user, dir_path, dir_docs)

try:
raw_md = archive.read(md_path).decode("utf-8", errors="ignore")
except Exception: # noqa: BLE001
raw_md = ""

title_match = re.search(r"^#\s+(.+)$", raw_md, flags=re.MULTILINE)
title = title_match.group(1).strip() if title_match else file_name.rsplit(".", 1)[0]

if parent_doc is None:
doc = models.Document.add_root(
depth=1,
creator=user,
title=title,
link_reach=models.LinkReachChoices.RESTRICTED,
)
else:
doc = parent_doc.add_child(creator=user, title=title)
Comment on lines 159 to 172
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if parent_doc is None:
doc = models.Document.add_root(
depth=1,
creator=user,
title=title,
link_reach=models.LinkReachChoices.RESTRICTED,
)
else:
doc = parent_doc.add_child(creator=user, title=title)

This is managed in the _ensure_dir_documents function. You will probably end up with duplicated docs.


# If this md file has a corresponding directory, register it as a container
# so nested children will use this doc as parent instead of creating a duplicate
base_path = md_path.rsplit(".md", 1)[0]
if base_path in md_with_dirs:
dir_docs[base_path] = doc

models.DocumentAccess.objects.update_or_create(
document=doc,
user=user,
defaults={"role": models.RoleChoices.OWNER},
)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
models.DocumentAccess.objects.update_or_create(
document=doc,
user=user,
defaults={"role": models.RoleChoices.OWNER},
)

Managed in the _ensure_dir_documents function


def replace_img_link(match: re.Match[str]) -> str:
    """Rewrite one markdown image tag so it points at the uploaded attachment.

    ``match`` is a hit of the image pattern; group 1 is the link target.
    Remote (http/https) targets, targets that resolve to an unsafe path, and
    targets that are not present in the archive are returned unchanged.
    Otherwise the asset is uploaded and only the link target is replaced by
    the resulting media URL.
    """
    original = match.group(0)
    url = match.group(1)
    if url.startswith(("http://", "https://")):
        return original

    # Resolve the link relative to the markdown file's directory in the zip.
    asset_rel = f"{dir_path}/{url}" if dir_path else url
    asset_rel = re.sub(r"/+", "/", asset_rel)
    # Sanitize the computed asset path: no absolute or parent-traversal lookups.
    if asset_rel.startswith("/") or ".." in asset_rel.split("/"):
        return original

    data = read_bytes(asset_rel)
    if data is None:
        return original

    media_url = _upload_attachment(user, doc, arcname=url, data=data)

    # Splice by position instead of str.replace(): the URL text may also occur
    # in the alt text (e.g. "![image.png](image.png)"), which must stay intact.
    url_start = match.start(1) - match.start(0)
    url_end = match.end(1) - match.start(0)
    return f"{original[:url_start]}{media_url}{original[url_end:]}"

rewritten_md = img_pattern.sub(replace_img_link, raw_md)

try:
ydoc_b64 = converter.convert(
rewritten_md.encode("utf-8"),
content_type="text/markdown",
accept="application/vnd.yjs.doc",
)
doc.content = ydoc_b64
doc.save()
except Exception: # noqa: BLE001
# Keep doc without content on conversion error but continue import
pass

created_ids.append(str(doc.id))

return created_ids
127 changes: 127 additions & 0 deletions src/backend/core/tests/imports/test_api_outline_import_upload.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
"""Tests for the Outline zip import API endpoint."""

import io
import zipfile
from unittest.mock import patch

from django.core.files.uploadedfile import SimpleUploadedFile

import pytest
from rest_framework.test import APIClient

from core import factories
from core.api.viewsets import malware_detection
from core.services.outline_import import OutlineImportError


pytestmark = pytest.mark.django_db


def make_zip_with_markdown_and_image(md_path: str, md_content: str, img_path: str, img_bytes: bytes) -> bytes:
    """Build an in-memory zip holding one markdown file and, optionally, one image.

    Args:
        md_path: Archive path of the markdown member.
        md_content: Text written to the markdown member.
        img_path: Archive path of the image member; pass "" to omit the image.
        img_bytes: Raw bytes written to the image member.

    Returns:
        The bytes of the resulting zip archive.
    """
    buf = io.BytesIO()
    with zipfile.ZipFile(buf, mode="w") as zf:
        zf.writestr(md_path, md_content)
        # Some callers only need a markdown file and pass an empty img_path;
        # skip the image then rather than writing a member with an empty name.
        if img_path:
            zf.writestr(img_path, img_bytes)
    return buf.getvalue()


def test_outline_import_upload_anonymous_forbidden():
    """The upload endpoint rejects requests that carry no credentials."""
    # Minimal archive: a zip with no members.
    empty_archive = io.BytesIO()
    zipfile.ZipFile(empty_archive, mode="w").close()
    payload = {
        "file": SimpleUploadedFile(
            name="export.zip",
            content=empty_archive.getvalue(),
            content_type="application/zip",
        )
    }

    anonymous_client = APIClient()
    result = anonymous_client.post(
        "/api/v1.0/imports/outline/upload", payload, format="multipart"
    )

    assert result.status_code == 401
    assert result.json()["detail"] == "Authentication credentials were not provided."


@patch("core.services.converter_services.YdocConverter.convert", return_value="YmFzZTY0Y29udGVudA==")
def test_outline_import_upload_authenticated_success(mock_convert):
    """A logged-in user uploading an Outline export gets documents created."""
    account = factories.UserFactory()
    api = APIClient()
    api.force_login(account)

    # A tiny PNG that the markdown below references from the same folder.
    png_payload = (
        b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x06\x00"
        b"\x00\x00\x1f\x15\xc4\x89\x00\x00\x00\nIDATx\x9cc\xf8\xff\xff?\x00\x05\xfe\x02\xfe"
        b"\xa7V\xbd\xfa\x00\x00\x00\x00IEND\xaeB`\x82"
    )
    markdown_source = "# Imported Title\n\nSome text.\n\n![Alt](image.png)\n"
    archive_bytes = make_zip_with_markdown_and_image(
        md_path="Folder1/page.md",
        md_content=markdown_source,
        img_path="Folder1/image.png",
        img_bytes=png_payload,
    )
    payload = {
        "file": SimpleUploadedFile(
            name="export.zip",
            content=archive_bytes,
            content_type="application/zip",
        )
    }

    with patch.object(malware_detection, "analyse_file") as mock_analyse_file:
        result = api.post("/api/v1.0/imports/outline/upload", payload, format="multipart")

    assert result.status_code == 201
    body = result.json()
    assert "created_document_ids" in body
    # Container folders are not reported: only the markdown-backed document id is.
    assert len(body["created_document_ids"]) == 1

    # Exactly one conversion call was made for the single markdown file.
    mock_convert.assert_called_once()
    # The malware scan hook was invoked during the import.
    assert mock_analyse_file.called


def test_outline_import_upload_invalid_zip_returns_validation_error():
    """A payload that is not a zip archive yields a 400 with a field error."""
    account = factories.UserFactory()
    api = APIClient()
    api.force_login(account)

    # Correct extension and content type, but the bytes are not a zip.
    bogus_archive = SimpleUploadedFile(
        name="export.zip", content=b"not-a-zip", content_type="application/zip"
    )
    result = api.post(
        "/api/v1.0/imports/outline/upload", {"file": bogus_archive}, format="multipart"
    )

    assert result.status_code == 400
    assert result.json() == {"file": ["Invalid zip archive"]}


@patch("core.api.imports.process_outline_zip", side_effect=OutlineImportError("boom"))
def test_outline_import_upload_outline_error_returns_validation_error(mock_process_outline):
    """An OutlineImportError raised by the service comes back as a 400 field error."""
    account = factories.UserFactory()
    api = APIClient()
    api.force_login(account)

    archive_bytes = make_zip_with_markdown_and_image(
        md_path="doc.md", md_content="# Title", img_path="", img_bytes=b""
    )
    payload = {
        "file": SimpleUploadedFile(
            name="export.zip",
            content=archive_bytes,
            content_type="application/zip",
        )
    }

    result = api.post("/api/v1.0/imports/outline/upload", payload, format="multipart")

    assert result.status_code == 400
    assert result.json() == {"file": ["boom"]}
    mock_process_outline.assert_called_once()
Loading
Loading