feat: add flash-attn to reduce VRAM usage and speed up inference

johannhartmann · johannhartmann · commit 4452186e49b5 · 2026-03-10T15:14:17.000+01:00
Without flash-attention, eager attention materializes an O(N²) score
matrix (N = sequence length in tokens) in every layer. On high-res PDF
pages this needs 7+ GB for activations alone, exceeding the MPS memory
limit. Flash-attention never materializes the full matrix, reducing
attention memory to O(N).
diff --git a/Containerfile.mayflower-qwen3vl b/Containerfile.mayflower-qwen3vl
@@ -9,8 +9,10 @@ USER 0
 ENV PIP_DISABLE_PIP_VERSION_CHECK=1
 
 RUN /opt/app-root/bin/python -m pip install --no-cache-dir \
+    flash-attn --no-build-isolation && \
+    /opt/app-root/bin/python -m pip install --no-cache-dir \
     "git+${QWEN3VL_PLUGIN_REPO}@${QWEN3VL_PLUGIN_REF}" && \
-    /opt/app-root/bin/python -c "from docling_ocr_qwen3vl.options import DEFAULT_QWEN3VL_MODEL_REPO_ID; print('default_model_repo_id=', DEFAULT_QWEN3VL_MODEL_REPO_ID)"
+    /opt/app-root/bin/python -c "import flash_attn; print('flash-attn', flash_attn.__version__); from docling_ocr_qwen3vl.options import DEFAULT_QWEN3VL_MODEL_REPO_ID; print('default_model_repo_id=', DEFAULT_QWEN3VL_MODEL_REPO_ID)"
 
 ENV DOCLING_SERVE_ALLOW_EXTERNAL_PLUGINS=true