From 2d268ffb6fcc4c0e09f9a1482204a2e2501123d4 Mon Sep 17 00:00:00 2001 From: jehlum11 Date: Mon, 23 Mar 2026 15:17:57 -0400 Subject: [PATCH 1/5] docs: add agent skill bundle with convert/evaluate helpers - Add docs/examples/agent_skill/docling-document-intelligence/ with SKILL.md, pipelines.md, EXAMPLE.md, improvement-log template, and scripts/docling-convert.py + docling-evaluate.py (standard/vlm-local/vlm-api). - Document InputFormat.PDF + PdfFormatOption for explicit PdfPipelineOptions. - Link from examples index and mkdocs nav. Made-with: Cursor --- .../docling-document-intelligence/EXAMPLE.md | 87 +++++ .../docling-document-intelligence/README.md | 36 ++ .../docling-document-intelligence/SKILL.md | 353 +++++++++++++++++ .../improvement-log.md | 20 + .../pipelines.md | 216 +++++++++++ .../scripts/docling-convert.py | 360 ++++++++++++++++++ .../scripts/docling-evaluate.py | 287 ++++++++++++++ .../scripts/requirements.txt | 4 + docs/examples/index.md | 1 + mkdocs.yml | 1 + 10 files changed, 1365 insertions(+) create mode 100644 docs/examples/agent_skill/docling-document-intelligence/EXAMPLE.md create mode 100644 docs/examples/agent_skill/docling-document-intelligence/README.md create mode 100644 docs/examples/agent_skill/docling-document-intelligence/SKILL.md create mode 100644 docs/examples/agent_skill/docling-document-intelligence/improvement-log.md create mode 100644 docs/examples/agent_skill/docling-document-intelligence/pipelines.md create mode 100644 docs/examples/agent_skill/docling-document-intelligence/scripts/docling-convert.py create mode 100644 docs/examples/agent_skill/docling-document-intelligence/scripts/docling-evaluate.py create mode 100644 docs/examples/agent_skill/docling-document-intelligence/scripts/requirements.txt diff --git a/docs/examples/agent_skill/docling-document-intelligence/EXAMPLE.md b/docs/examples/agent_skill/docling-document-intelligence/EXAMPLE.md new file mode 100644 index 0000000000..ab1ae8986a --- /dev/null +++ 
b/docs/examples/agent_skill/docling-document-intelligence/EXAMPLE.md @@ -0,0 +1,87 @@ +# Using the Docling agent skill + +[Agent Skills](https://agentskills.io/specification) are folders of instructions that AI coding agents (Cursor, Claude Code, GitHub Copilot, etc.) can load when relevant. This bundle lives in the Docling repo at: + +`docs/examples/agent_skill/docling-document-intelligence/` + +## Install (copy into your agent’s skills directory) + +```bash +# From a checkout of github.com/docling-project/docling +cp -r docs/examples/agent_skill/docling-document-intelligence ~/.cursor/skills/ +# or e.g. ~/.claude/skills/ depending on your tool +``` + +No extra config is required beyond installing Python dependencies (below). + +## Usage + +Open your agent-enabled IDE and ask, for example: + +``` +Parse report.pdf and give me a structural outline +``` + +``` +Convert https://arxiv.org/pdf/2408.09869 to markdown +``` + +``` +Chunk invoice.pdf for RAG ingestion with 512 token chunks +``` + +``` +Process scanned.pdf using the VLM pipeline +``` + +The agent should read `SKILL.md`, match the task, and run the appropriate pipeline. 
+ +## Running the helper scripts directly + +From the **bundle root** (the `docling-document-intelligence` directory): + +```bash +pip install -r scripts/requirements.txt + +python3 scripts/docling-convert.py report.pdf + +python3 scripts/docling-convert.py report.pdf --ocr-engine rapidocr + +python3 scripts/docling-convert.py report.pdf --format chunks --max-tokens 512 + +python3 scripts/docling-convert.py scanned.pdf --pipeline vlm-local + +python3 scripts/docling-convert.py doc.pdf \ + --pipeline vlm-api \ + --vlm-api-url http://localhost:8000/v1/chat/completions \ + --vlm-api-model ibm-granite/granite-docling-258M +``` + +## Evaluate and refine + +```bash +python3 scripts/docling-convert.py report.pdf --format json --out /tmp/doc.json +python3 scripts/docling-convert.py report.pdf --format markdown --out /tmp/doc.md +python3 scripts/docling-evaluate.py /tmp/doc.json --markdown /tmp/doc.md +``` + +If the report shows `warn` or `fail`, follow `recommended_actions`, re-convert, +and optionally append a note to `improvement-log.md` (see `SKILL.md` section 6). 
+ +## What the skill covers + +| Task | How to ask | +|---|---| +| Parse PDF / DOCX / PPTX / HTML / image | "parse this file" | +| Convert to Markdown | "convert to markdown" | +| Export as structured JSON | "export as JSON" | +| Chunk for RAG | "chunk for RAG", "prepare for ingestion" | +| Analyze structure | "show me the headings and tables" | +| Use VLM pipeline | "use the VLM pipeline", "process scanned PDF" | +| Use remote inference | "use vLLM", "call the API pipeline" | + +## Further reading + +- [Agent Skills specification](https://agentskills.io/specification) +- [Docling documentation](https://docling-project.github.io/docling/) +- [Docling GitHub](https://github.com/docling-project/docling) diff --git a/docs/examples/agent_skill/docling-document-intelligence/README.md b/docs/examples/agent_skill/docling-document-intelligence/README.md new file mode 100644 index 0000000000..7d077e75fd --- /dev/null +++ b/docs/examples/agent_skill/docling-document-intelligence/README.md @@ -0,0 +1,36 @@ +# Docling agent skill (Cursor & compatible assistants) + +This folder is an **[Agent Skill](https://agentskills.io/specification)**-style bundle for AI coding assistants: structured instructions (`SKILL.md`), a pipeline reference (`pipelines.md`), optional helper scripts under `scripts/`, and an evaluator for conversion quality. + +It complements the official [Docling documentation](https://docling-project.github.io/docling/) and the [`docling` CLI](https://docling-project.github.io/docling/reference/cli/); use it when you want agents to follow a consistent **convert → export JSON → evaluate → refine** workflow. 
+ +## Contents + +| Path | Purpose | +|------|---------| +| [`SKILL.md`](SKILL.md) | Full skill instructions (pipelines, chunking, evaluation loop) | +| [`pipelines.md`](pipelines.md) | Standard vs VLM pipelines, OCR engines, API notes | +| [`EXAMPLE.md`](EXAMPLE.md) | Copying the skill into `~/.cursor/skills/` or similar; running scripts | +| [`improvement-log.md`](improvement-log.md) | Optional template for local “what worked” notes | +| [`scripts/docling-convert.py`](scripts/docling-convert.py) | CLI: Markdown / JSON / RAG chunks | +| [`scripts/docling-evaluate.py`](scripts/docling-evaluate.py) | Heuristic quality report on JSON (+ optional Markdown) | +| [`scripts/requirements.txt`](scripts/requirements.txt) | Minimal pip deps for the scripts | + +## Quick start (from this directory) + +```bash +pip install -r scripts/requirements.txt +python3 scripts/docling-convert.py https://arxiv.org/pdf/2408.09869 --out /tmp/out.md +python3 scripts/docling-convert.py https://arxiv.org/pdf/2408.09869 --format json --out /tmp/out.json +python3 scripts/docling-evaluate.py /tmp/out.json --markdown /tmp/out.md +``` + +Use `--pipeline vlm-local` or `--pipeline vlm-api` for vision-model pipelines; see `SKILL.md` and `pipelines.md`. + +## Using as a Cursor / Claude skill + +Copy the folder `docling-document-intelligence` into your tool’s skills directory (see [`EXAMPLE.md`](EXAMPLE.md)). The `SKILL.md` frontmatter describes when the skill should activate. + +## License + +Contributed under the same terms as the [Docling](https://github.com/docling-project/docling) repository (MIT). 
diff --git a/docs/examples/agent_skill/docling-document-intelligence/SKILL.md b/docs/examples/agent_skill/docling-document-intelligence/SKILL.md new file mode 100644 index 0000000000..9c5ca1b1d7 --- /dev/null +++ b/docs/examples/agent_skill/docling-document-intelligence/SKILL.md @@ -0,0 +1,353 @@ +--- +name: docling-document-intelligence +description: > + Parse, convert, chunk, and analyze documents using Docling. Use this skill + when the user provides a document (PDF, DOCX, PPTX, HTML, image) as a file + path or URL and wants to: extract text or structured content, convert to + Markdown or JSON, chunk the document for RAG ingestion, analyze document + structure (headings, tables, figures, reading order), or run quality + evaluation with iterative pipeline tuning. Triggers: "parse this PDF", + "convert to markdown", "chunk for RAG", "extract tables", "analyze document + structure", "prepare for ingestion", "process document", "evaluate docling + output", "improve conversion quality". +license: MIT +compatibility: Requires Python 3.10+, docling>=2.81.0, docling-core>=2.67.1 +metadata: + author: docling-project + version: "1.4" + upstream: https://github.com/docling-project/docling +allowed-tools: Bash(python3:*) Bash(pip:*) +--- + +# Docling Document Intelligence Skill + +Use this skill to parse, convert, chunk, and analyze documents with Docling. +It handles both local file paths and URLs, and outputs either Markdown or +structured JSON (`DoclingDocument`). + +## Scope + +| Task | Covered | +|---|---| +| Parse PDF / DOCX / PPTX / HTML / image | ✅ | +| Convert to Markdown | ✅ | +| Export as DoclingDocument JSON | ✅ | +| Chunk for RAG (hybrid: heading + token) | ✅ | +| Analyze structure (headings, tables, figures) | ✅ | +| OCR for scanned PDFs | ✅ (auto-enabled) | +| Multi-source batch conversion | ✅ | + +## Step-by-Step Instructions + +### 1. Resolve the input + +Determine whether the user supplied a **local path** or a **URL**. 
+ +- Local path → pass as `str` or `Path` directly to `DocumentConverter` +- URL → pass as `str`; Docling fetches it automatically +- Multiple inputs → pass a list + +```python +sources = ["path/to/file.pdf"] # local +sources = ["https://example.com/a.pdf"] # URL +sources = ["file1.pdf", "file2.docx"] # batch +``` + +### 2. Choose a pipeline + +Docling has three pipelines. Pick based on document type and hardware. + +| Pipeline | Best for | Key tradeoff | +|---|---|---| +| **Standard** (default) | Born-digital PDFs, speed | No GPU needed; OCR for scanned pages | +| **VLM local** | Complex layouts, handwriting, formulas | Needs GPU; slower | +| **VLM API** | Production scale, remote inference | Requires inference server | + +See [pipelines.md](pipelines.md) for the full decision matrix, OCR engine table +(EasyOCR, RapidOCR, Tesseract, macOS; Tesseract CLI and future engines such as +Nemotron in Python only when supported by your Docling version), and VLM presets. + +### 3. Convert the document + +**Docling 2.81+ API note:** `DocumentConverter(format_options=...)` expects +`dict[InputFormat, FormatOption]` (e.g. `InputFormat.PDF` → `PdfFormatOption`). +Using string keys like `{"pdf": PdfPipelineOptions(...)}` fails at runtime with +`AttributeError: 'PdfPipelineOptions' object has no attribute 'backend'`. 
+ +**Standard pipeline (default):** +```python +from docling.document_converter import DocumentConverter, PdfFormatOption +from docling.datamodel.base_models import InputFormat +from docling.datamodel.pipeline_options import PdfPipelineOptions + +# Defaults: standard PDF pipeline, OCR + tables +converter = DocumentConverter() +result = converter.convert(sources[0]) + +# Custom PdfPipelineOptions (same API as scripts/docling-convert.py --pipeline standard) +converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_options=PdfPipelineOptions(do_ocr=True, do_table_structure=True), + ), + } +) +result = converter.convert(sources[0]) +``` + +**VLM pipeline — local (GraniteDocling via HF Transformers):** +```python +from docling.document_converter import DocumentConverter, PdfFormatOption +from docling.datamodel.base_models import InputFormat +from docling.datamodel.pipeline_options import VlmPipelineOptions +from docling.datamodel import vlm_model_specs +from docling.pipeline.vlm_pipeline import VlmPipeline + +pipeline_options = VlmPipelineOptions( + vlm_options=vlm_model_specs.GRANITEDOCLING_TRANSFORMERS, + generate_page_images=True, +) +converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_cls=VlmPipeline, + pipeline_options=pipeline_options, + ) + } +) +result = converter.convert(sources[0]) +``` + +**VLM pipeline — remote API (vLLM / LM Studio / Ollama):** +```python +from docling.document_converter import DocumentConverter, PdfFormatOption +from docling.datamodel.base_models import InputFormat +from docling.datamodel.pipeline_options import VlmPipelineOptions +from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions, ResponseFormat +from docling.pipeline.vlm_pipeline import VlmPipeline + +vlm_opts = ApiVlmOptions( + url="http://localhost:8000/v1/chat/completions", + params=dict(model="ibm-granite/granite-docling-258M", max_tokens=4096), + prompt="Convert this page to 
docling.", + response_format=ResponseFormat.DOCTAGS, + timeout=120, +) +pipeline_options = VlmPipelineOptions( + vlm_options=vlm_opts, + generate_page_images=True, + enable_remote_services=True, # required — gates all outbound HTTP +) +converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_cls=VlmPipeline, + pipeline_options=pipeline_options, + ) + } +) +result = converter.convert(sources[0]) +``` + +`result.document` is a `DoclingDocument` object in all three cases. + +### 3.1 Choose output format + +**Markdown** (default, human-readable): +```python +md = result.document.export_to_markdown() +``` + +**JSON / DoclingDocument** (structured, lossless): +```python +import json +doc_json = result.document.model_dump() # dict +doc_json_str = result.document.export_to_dict() # serialisable dict +``` + +> If the user does not specify a format, ask: "Should I output Markdown or +> structured JSON (DoclingDocument)?" + +### 4. Chunk for RAG (hybrid strategy) + +Default: **hybrid chunker** — splits first by heading hierarchy, then +subdivides oversized sections by token count. This preserves semantic +boundaries while respecting model context limits. + +The tokenizer API changed in docling-core 2.8.0.
Pass a `BaseTokenizer` +object, not a raw string: + +**HuggingFace tokenizer (default):** +```python +from docling.chunking import HybridChunker +from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer + +tokenizer = HuggingFaceTokenizer.from_pretrained( + model_name="sentence-transformers/all-MiniLM-L6-v2", + max_tokens=512, +) +chunker = HybridChunker(tokenizer=tokenizer, merge_peers=True) +chunks = list(chunker.chunk(result.document)) + +for chunk in chunks: + # contextualize() is the correct method for embedding-ready text — + # it enriches chunk.text with heading breadcrumb metadata + embed_text = chunker.contextualize(chunk) + print(chunk.meta.headings) # heading breadcrumb list + print(chunk.meta.origin.page_no) # source page number +``` + +**OpenAI tokenizer (for OpenAI embedding models):** +```python +import tiktoken +from docling_core.transforms.chunker.tokenizer.openai import OpenAITokenizer + +tokenizer = OpenAITokenizer( + tokenizer=tiktoken.encoding_for_model("text-embedding-3-small"), + max_tokens=8192, +) +# Requires: pip install 'docling-core[chunking-openai]' +``` + +For chunking strategies and tokenizer details, see the Docling documentation +on chunking and `HybridChunker`. + +### 5. Analyze document structure + +Use the `DoclingDocument` object directly to inspect structure: + +```python +doc = result.document + +# Iterate headings +for item, level in doc.iterate_items(): + if hasattr(item, 'label') and item.label.name == 'SECTION_HEADER': + print(f"{'#' * level} {item.text}") + +# Extract tables +for table in doc.tables: + print(table.export_to_dataframe()) # pandas DataFrame + print(table.export_to_markdown()) + +# Extract figures / images +for picture in doc.pictures: + print(picture.caption_text(doc)) # caption if present +``` + +For the full API surface, see Docling’s structure and table export docs. + +### 6. 
Evaluate output and iterate (required for “best effort” conversions) + +After **every** conversion where the user cares about fidelity (not quick +previews), run the bundled evaluator on the JSON export, then refine the +pipeline if needed. This is how the agent **checks its work** and **improves +the run** without guessing. + +**Step A — Produce JSON and optional Markdown** + +```bash +# From the bundle root (directory containing scripts/ and SKILL.md): +python3 scripts/docling-convert.py "<source>" --format json --out /tmp/docling-out.json +python3 scripts/docling-convert.py "<source>" --format markdown --out /tmp/docling-out.md +``` + +**Step B — Evaluate** + +```bash +python3 scripts/docling-evaluate.py /tmp/docling-out.json --markdown /tmp/docling-out.md +``` + +If the user expects tables (invoices, spreadsheets in PDF), add +`--expect-tables`. Tighten gates with `--fail-on-warn` in CI-style checks. + +The script prints a JSON report to stdout: `status` (`pass` | `warn` | `fail`), +`metrics`, `issues`, and `recommended_actions` (concrete `scripts/docling-convert.py` +flags to try next). + +**Step C — Refinement loop (max 3 attempts unless the user says otherwise)** + +1. If `status` is `warn` or `fail`, apply **one** primary change from + `recommended_actions` (e.g. switch standard → VLM, change OCR engine, + ensure tables are enabled, hybrid `--force-backend-text`). +2. Re-convert, re-export JSON, re-run `scripts/docling-evaluate.py`. +3. Stop when `status` is `pass`, or after 3 iterations — then summarize what + worked and any remaining issues for the user. + +**Step D — Self-improvement log (skill memory)** + +After a successful pass **or** after the final iteration, append one entry to +[improvement-log.md](improvement-log.md) in this skill directory: + +- Source type (e.g.
scanned PDF, digital PDF, DOCX) +- First-run problems (from `issues`) +- Pipeline + flags that fixed or best mitigated them +- Final `status` and one line of subjective quality notes + +This log is optional for the user to git-ignore; it is for **local** learning +so future runs on similar documents start closer to the right pipeline. + +### 7. Agent quality checklist (manual, if script unavailable) + +If `scripts/docling-evaluate.py` cannot run, still verify: + +| Check | Action if bad | +|---|---| +| Page count matches source (roughly) | Re-run; try VLM if layout is complex | +| Markdown is not near-empty | Enable OCR / VLM | +| Tables missing when visually obvious | Enable table structure; try VLM | +| `\ufffd` replacement characters | Different OCR or VLM | +| Same line repeated many times | VLM or hybrid `--force-backend-text` | + +## Common Edge Cases + +| Situation | Handling | +|---|---| +| Scanned / image-only PDF | Standard pipeline with OCR, or VLM pipeline for best quality | +| Password-protected PDF | Will raise `ConversionError`; surface to user | +| Very large document (500+ pages) | Standard pipeline with `do_table_structure=False` for speed | +| Complex layout / multi-column | Prefer VLM pipeline; standard may misorder reading flow | +| Handwriting or formulas | VLM pipeline only — standard OCR will not handle these | +| URL behind auth | Pre-download to temp file; pass local path | +| Tables with merged cells | `table.export_to_markdown()` handles spans; VLM pipeline often more accurate | +| Non-UTF-8 encoding | Docling normalises internally; no special handling needed | +| VLM hallucinating text | Set `force_backend_text=True` for hybrid mode (PDF text + VLM layout) | +| VLM API call blocked | `enable_remote_services=True` is mandatory on `VlmPipelineOptions` | +| Apple Silicon | Use `GRANITEDOCLING_MLX` preset for MPS acceleration | + +## Pipeline reference + +Full decision matrix, all OCR engine options, VLM model presets, and API +server 
configuration: [pipelines.md](pipelines.md) + +## Output conventions + +- Always report the number of pages and conversion status. +- When evaluation is in scope, report evaluator `status`, top `issues`, and + which refinement attempt produced the final output. +- For Markdown output: wrap in a fenced code block only if the user will copy/paste it; otherwise render directly. +- For JSON output: pretty-print with `indent=2` unless the user specifies otherwise. +- For chunks: report total chunk count, min/max/avg token counts. +- For structure analysis: summarise heading tree + table count + figure count before going into detail. + +## Dependencies + +Install from the bundled requirements file (always pulls latest compatible): + +```bash +pip install -r scripts/requirements.txt +``` + +Or manually: + +```bash +pip install docling docling-core +# For OpenAI tokenizer support: +pip install 'docling-core[chunking-openai]' +``` + +Check installed versions (prefer distribution metadata — `docling` may not set `__version__`): + +```python +from importlib.metadata import version +print(version("docling"), version("docling-core")) +``` diff --git a/docs/examples/agent_skill/docling-document-intelligence/improvement-log.md b/docs/examples/agent_skill/docling-document-intelligence/improvement-log.md new file mode 100644 index 0000000000..092c4043d2 --- /dev/null +++ b/docs/examples/agent_skill/docling-document-intelligence/improvement-log.md @@ -0,0 +1,20 @@ +# Docling agent skill — improvement log + +Agents may append a short entry after running **evaluate → refine** on a document +so similar files are faster to process next time. This file is optional and is +not tracked by every user; it is meant for **local** learning. + +## Template (copy for each entry) + +```markdown +### YYYY-MM-DD — +- **Source type:** (e.g. 
scanned PDF / digital PDF / DOCX / URL) +- **Issues (first run):** … +- **Pipeline / flags that helped:** … +- **Final evaluator status:** pass | warn | fail +- **Notes:** … +``` + +## Entries + +_(None — add your own after running conversions.)_ diff --git a/docs/examples/agent_skill/docling-document-intelligence/pipelines.md b/docs/examples/agent_skill/docling-document-intelligence/pipelines.md new file mode 100644 index 0000000000..e50d8af140 --- /dev/null +++ b/docs/examples/agent_skill/docling-document-intelligence/pipelines.md @@ -0,0 +1,216 @@ +# Docling Pipelines Reference + +Docling has two pipeline families for PDFs: **standard** (parse + OCR + layout/tables) +and **VLM** (page images through a vision-language model). The helper +`scripts/docling-convert.py` exposes **three modes**: `standard`, `vlm-local`, `vlm-api`. +The right choice depends on document type, hardware, and latency budget. + +--- + +## Decision matrix + +| Document type | Recommended pipeline | Reason | +|---|---|---| +| Born-digital PDF (text selectable) | Standard | Fast, accurate, no GPU needed | +| Scanned PDF / image-only | Standard + OCR or VLM | Depends on quality | +| Complex layout (multi-column, dense tables) | VLM local | Better structural understanding | +| Handwriting, formulas, figures with embedded text | VLM | Only viable option | +| Air-gapped / no GPU | Standard | Runs on CPU | +| Production scale, GPU server available | VLM API (vLLM) | Best throughput | +| Apple Silicon / local dev | VLM local (MLX) | MPS acceleration | +| Speed-critical, accuracy secondary | Standard, no tables | Fastest path | + +--- + +## Pipeline 1: Standard PDF Pipeline + +Uses deterministic PDF parsing (docling-parse) + optional neural OCR + neural +table structure detection. 
+ +```python +from docling.document_converter import DocumentConverter, PdfFormatOption +from docling.datamodel.base_models import InputFormat +from docling.datamodel.pipeline_options import PdfPipelineOptions + +# Minimal — library defaults (standard PDF pipeline) +converter = DocumentConverter() + +# Explicit PdfPipelineOptions (docling 2.81+): use InputFormat.PDF + PdfFormatOption. +# Do not use format_options={"pdf": opts}; that raises AttributeError on pipeline options. +opts = PdfPipelineOptions( + do_ocr=True, # False = skip OCR entirely + do_table_structure=True, # False = skip table detection (faster) +) +converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption(pipeline_options=opts), + } +) +``` + +### OCR engine options + +All engines are plug-and-play via `ocr_options`. Default is EasyOCR. + +```python +# EasyOCR (default — no extra install needed) +from docling.datamodel.pipeline_options import PdfPipelineOptions +opts = PdfPipelineOptions(do_ocr=True) # uses EasyOCR by default + +# Tesseract (requires system Tesseract + pip install tesserocr — see Docling install docs) +from docling.datamodel.pipeline_options import TesseractOcrOptions +opts = PdfPipelineOptions(do_ocr=True, ocr_options=TesseractOcrOptions()) + +# RapidOCR (lightweight, no C deps) +from docling.datamodel.pipeline_options import RapidOcrOptions +opts = PdfPipelineOptions(do_ocr=True, ocr_options=RapidOcrOptions()) + +# macOS native OCR +from docling.datamodel.pipeline_options import OcrMacOptions +opts = PdfPipelineOptions(do_ocr=True, ocr_options=OcrMacOptions()) +``` + +### Standard pipeline + OCR: CLI vs Python + +`scripts/docling-convert.py` (`--pipeline standard`) maps engines like this: + +| Engine | CLI | Notes | +|--------|-----|--------| +| EasyOCR | `--ocr-engine easyocr` (default) | No extra pip beyond docling defaults | +| RapidOCR | `--ocr-engine rapidocr` | Lightweight; see Docling notes on read-only FS | +| Tesseract | `--ocr-engine tesseract` 
| Uses `TesseractOcrOptions` → needs **`pip install tesserocr`** and system Tesseract | +| macOS Vision | `--ocr-engine mac` | `OcrMacOptions` | + +**Tesseract without `tesserocr`:** Docling also provides `TesseractCliOcrOptions` (shell out to the `tesseract` binary). This helper CLI does not expose it yet; set `ocr_options=TesseractCliOcrOptions()` in Python if you only have the CLI installed. + +**NVIDIA Nemotron OCR:** Not exposed in `docling-convert.py` and not present in docling **2.81.x** `pipeline_options` on PyPI. If a future Docling release adds a Nemotron options class, configure `PdfPipelineOptions(ocr_options=...)` in Python (see [pipeline options](https://docling-project.github.io/docling/reference/pipeline_options/) for your installed version) or extend the CLI. + +--- + +## Pipeline 2: VLM Pipeline — local inference + +Processes each page as an image through a vision-language model. Replaces the +standard layout detection + OCR stack entirely. + +```python +from docling.document_converter import DocumentConverter, PdfFormatOption +from docling.datamodel.base_models import InputFormat +from docling.datamodel.pipeline_options import VlmPipelineOptions +from docling.datamodel import vlm_model_specs +from docling.pipeline.vlm_pipeline import VlmPipeline + +pipeline_options = VlmPipelineOptions( + vlm_options=vlm_model_specs.GRANITEDOCLING_TRANSFORMERS, + generate_page_images=True, +) + +converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_cls=VlmPipeline, + pipeline_options=pipeline_options, + ) + } +) +``` + +### Available model presets (`vlm_model_specs`) + +| Preset | Model | Backend | Device | Notes | +|---|---|---|---|---| +| `GRANITEDOCLING_TRANSFORMERS` | granite-docling-258M | HF Transformers | CPU/GPU | Default | +| `SMOLDOCLING_TRANSFORMERS` | smoldocling-256M | HF Transformers | CPU/GPU | Lighter | +| `GRANITEDOCLING_VLLM` | granite-docling-258M | vLLM | GPU | Fast batch | +| `GRANITEDOCLING_MLX` | 
granite-docling-258M-mlx | MLX | Apple MPS | M-series Macs | + +### Hybrid mode: PDF text + VLM for images/tables + +Set `force_backend_text=True` to use deterministic text extraction for normal +text regions while routing images and tables through the VLM. Reduces +hallucination risk on text-heavy pages. + +```python +pipeline_options = VlmPipelineOptions( + vlm_options=vlm_model_specs.GRANITEDOCLING_TRANSFORMERS, + force_backend_text=True, # <-- hybrid mode + generate_page_images=True, +) +``` + +--- + +## Pipeline 3: VLM Pipeline — remote API + +Sends page images to any OpenAI-compatible endpoint. Works with vLLM, +LM Studio, Ollama, or a hosted model API. + +```python +from docling.document_converter import DocumentConverter, PdfFormatOption +from docling.datamodel.base_models import InputFormat +from docling.datamodel.pipeline_options import VlmPipelineOptions +from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions, ResponseFormat +from docling.pipeline.vlm_pipeline import VlmPipeline + +vlm_opts = ApiVlmOptions( + url="http://localhost:8000/v1/chat/completions", + params=dict( + model="ibm-granite/granite-docling-258M", + max_tokens=4096, + ), + headers={"Authorization": "Bearer YOUR_KEY"}, # omit if not needed + prompt="Convert this page to docling.", + response_format=ResponseFormat.DOCTAGS, + timeout=120, + scale=2.0, +) + +pipeline_options = VlmPipelineOptions( + vlm_options=vlm_opts, + generate_page_images=True, + enable_remote_services=True, # required — gates any HTTP call +) + +converter = DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_cls=VlmPipeline, + pipeline_options=pipeline_options, + ) + } +) +``` + +**`enable_remote_services=True` is mandatory** for API pipelines. Docling +blocks outbound HTTP by default as a safety measure. 
+ +### Common API targets + +| Server | Default URL | Notes | +|---|---|---| +| vLLM | `http://localhost:8000/v1/chat/completions` | Best throughput | +| LM Studio | `http://localhost:1234/v1/chat/completions` | Local dev | +| Ollama | `http://localhost:11434/v1/chat/completions` | Model: `ibm/granite-docling:258m` | +| OpenAI-compatible cloud | Provider URL | Set Authorization header | + +--- + +## VLM install requirements + +Local inference requires PyTorch + Transformers: + +```bash +pip install docling[vlm] +# or manually: +pip install torch transformers accelerate +``` + +MLX (Apple Silicon only): +```bash +pip install mlx mlx-lm +``` + +vLLM backend (server-side): +```bash +pip install vllm +vllm serve ibm-granite/granite-docling-258M +``` diff --git a/docs/examples/agent_skill/docling-document-intelligence/scripts/docling-convert.py b/docs/examples/agent_skill/docling-document-intelligence/scripts/docling-convert.py new file mode 100644 index 0000000000..efc7fa3f67 --- /dev/null +++ b/docs/examples/agent_skill/docling-document-intelligence/scripts/docling-convert.py @@ -0,0 +1,360 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: MIT +""" +docling-convert.py — Parse a document and emit Markdown, JSON, or RAG chunks. + +Requires: docling>=2.81.0, docling-core>=2.67.1, packaging +Install: pip install -r scripts/requirements.txt (from the bundle root directory) + +Usage (from bundle root, i.e. 
docling-document-intelligence/): + python3 scripts/docling-convert.py <source> [options] + +Arguments: + source Local file path or URL (required) + +Pipeline selection: + --pipeline standard | vlm-local | vlm-api (default: standard) + +Standard pipeline options: + --ocr-engine easyocr | tesseract | rapidocr | mac (default: easyocr) + --no-ocr Disable OCR entirely + --no-tables Skip table structure parsing (faster) + +VLM local pipeline options (--pipeline vlm-local): + --vlm-model granitedocling | smoldocling | granitedocling-vllm | granitedocling-mlx + (default: granitedocling) + --force-backend-text + Hybrid mode: use PDF text extraction for text, VLM for images/tables + +VLM API pipeline options (--pipeline vlm-api): + --vlm-api-url OpenAI-compatible endpoint (e.g. http://localhost:8000/v1/chat/completions) + --vlm-api-model Model name on the server (e.g. ibm-granite/granite-docling-258M) + --vlm-api-key API key if required (default: none) + +Output options: + --format markdown | json | chunks (default: markdown) + --max-tokens Max tokens per chunk (default: 512) + --tokenizer HuggingFace model id for chunking + --openai-model Use OpenAI tiktoken tokenizer for chunking + Requires: pip install 'docling-core[chunking-openai]' + --out Write output to file instead of stdout +""" + +import argparse +import json +import sys +from pathlib import Path + +MIN_DOCLING_VERSION = "2.81.0" +MIN_DOCLING_CORE_VERSION = "2.67.1" + + +def parse_args(): + p = argparse.ArgumentParser(description="Docling document converter") + p.add_argument("source", help="File path or URL") + + p.add_argument( + "--pipeline", choices=["standard", "vlm-local", "vlm-api"], default="standard" + ) + + p.add_argument( + "--ocr-engine", + choices=["easyocr", "tesseract", "rapidocr", "mac"], + default="easyocr", + ) + p.add_argument("--no-ocr", action="store_true") + p.add_argument("--no-tables", action="store_true") + + p.add_argument( + "--vlm-model", + choices=[ + "granitedocling", + "smoldocling",
"granitedocling-vllm", + "granitedocling-mlx", + ], + default="granitedocling", + ) + p.add_argument( + "--force-backend-text", + action="store_true", + help="Hybrid: PDF text for text regions, VLM for images/tables", + ) + + p.add_argument("--vlm-api-url", default="http://localhost:8000/v1/chat/completions") + p.add_argument("--vlm-api-model", default="ibm-granite/granite-docling-258M") + p.add_argument("--vlm-api-key", default=None) + + p.add_argument( + "--format", choices=["markdown", "json", "chunks"], default="markdown" + ) + p.add_argument("--max-tokens", type=int, default=512) + p.add_argument("--tokenizer", default="sentence-transformers/all-MiniLM-L6-v2") + p.add_argument("--openai-model", default=None) + p.add_argument("--out", default=None) + return p.parse_args() + + +def check_dependencies(): + from importlib.metadata import PackageNotFoundError, version as dist_version + + from packaging.version import Version + + missing: list[str] = [] + checks = [ + ("docling", "docling", MIN_DOCLING_VERSION), + ("docling_core", "docling-core", MIN_DOCLING_CORE_VERSION), + ] + for import_name, dist_name, min_ver in checks: + try: + __import__(import_name) + except ImportError: + missing.append(dist_name) + continue + try: + ver = dist_version(dist_name) + except PackageNotFoundError: + ver = "0.0.0" + if Version(ver) < Version(min_ver): + print( + f"WARNING: {dist_name}>={min_ver} recommended, found {ver}. 
" + f"Run: pip install --upgrade {dist_name}", + file=sys.stderr, + ) + if missing: + print( + f"ERROR: missing packages: {' '.join(missing)}\n" + f"Run: pip install -r scripts/requirements.txt (from the bundle root directory)", + file=sys.stderr, + ) + sys.exit(1) + + +def build_standard_converter(args): + from docling.datamodel.base_models import InputFormat + from docling.datamodel.pipeline_options import PdfPipelineOptions + from docling.document_converter import DocumentConverter, PdfFormatOption + + ocr_opts = None + if not args.no_ocr: + engine = args.ocr_engine + if engine == "tesseract": + from docling.datamodel.pipeline_options import TesseractOcrOptions + + ocr_opts = TesseractOcrOptions() + elif engine == "rapidocr": + from docling.datamodel.pipeline_options import RapidOcrOptions + + ocr_opts = RapidOcrOptions() + elif engine == "mac": + from docling.datamodel.pipeline_options import OcrMacOptions + + ocr_opts = OcrMacOptions() + + kwargs = dict( + do_ocr=not args.no_ocr, + do_table_structure=not args.no_tables, + ) + if ocr_opts is not None: + kwargs["ocr_options"] = ocr_opts + + pipeline_options = PdfPipelineOptions(**kwargs) + return DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options), + } + ) + + +def build_vlm_local_converter(args): + from docling.datamodel import vlm_model_specs + from docling.datamodel.base_models import InputFormat + from docling.datamodel.pipeline_options import VlmPipelineOptions + from docling.document_converter import DocumentConverter, PdfFormatOption + from docling.pipeline.vlm_pipeline import VlmPipeline + + model_map = { + "granitedocling": vlm_model_specs.GRANITEDOCLING_TRANSFORMERS, + "smoldocling": vlm_model_specs.SMOLDOCLING_TRANSFORMERS, + "granitedocling-vllm": vlm_model_specs.GRANITEDOCLING_VLLM, + "granitedocling-mlx": vlm_model_specs.GRANITEDOCLING_MLX, + } + vlm_opts = model_map[args.vlm_model] + + pipeline_options = VlmPipelineOptions( + 
vlm_options=vlm_opts, + generate_page_images=True, + force_backend_text=args.force_backend_text, + ) + + return DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_cls=VlmPipeline, + pipeline_options=pipeline_options, + ) + } + ) + + +def build_vlm_api_converter(args): + from docling.datamodel.base_models import InputFormat + from docling.datamodel.pipeline_options import VlmPipelineOptions + from docling.datamodel.pipeline_options_vlm_model import ( + ApiVlmOptions, + ResponseFormat, + ) + from docling.document_converter import DocumentConverter, PdfFormatOption + from docling.pipeline.vlm_pipeline import VlmPipeline + + headers = {} + if args.vlm_api_key: + headers["Authorization"] = f"Bearer {args.vlm_api_key}" + + vlm_opts = ApiVlmOptions( + url=args.vlm_api_url, + params=dict( + model=args.vlm_api_model, + max_tokens=4096, + ), + headers=headers if headers else None, + prompt="Convert this page to docling.", + response_format=ResponseFormat.DOCTAGS, + timeout=120, + scale=2.0, + ) + + pipeline_options = VlmPipelineOptions( + vlm_options=vlm_opts, + generate_page_images=True, + force_backend_text=args.force_backend_text, + enable_remote_services=True, + ) + + return DocumentConverter( + format_options={ + InputFormat.PDF: PdfFormatOption( + pipeline_cls=VlmPipeline, + pipeline_options=pipeline_options, + ) + } + ) + + +def build_tokenizer(hf_model_id: str, openai_model, max_tokens: int): + if openai_model: + try: + import tiktoken + from docling_core.transforms.chunker.tokenizer.openai import OpenAITokenizer + except ImportError: + print( + "ERROR: OpenAI tokenizer requires:\n" + " pip install 'docling-core[chunking-openai]'", + file=sys.stderr, + ) + sys.exit(1) + return OpenAITokenizer( + tokenizer=tiktoken.encoding_for_model(openai_model), + max_tokens=max_tokens, + ) + from docling_core.transforms.chunker.tokenizer.huggingface import ( + HuggingFaceTokenizer, + ) + + return HuggingFaceTokenizer.from_pretrained( + 
model_name=hf_model_id, + max_tokens=max_tokens, + ) + + +def output_markdown(doc) -> str: + return doc.export_to_markdown() + + +def output_json(doc) -> str: + return json.dumps(doc.export_to_dict(), indent=2, ensure_ascii=False) + + +def output_chunks(doc, tokenizer) -> str: + from docling.chunking import HybridChunker + + chunker = HybridChunker(tokenizer=tokenizer, merge_peers=True) + chunks = list(chunker.chunk(doc)) + texts = [chunker.contextualize(c) for c in chunks] + + try: + counts = [tokenizer.count_tokens(t) for t in texts] + stats = ( + f"chunks={len(chunks)} " + f"min={min(counts)} max={max(counts)} " + f"avg={sum(counts) // len(counts)}" + ) + except Exception: + stats = f"chunks={len(chunks)}" + + lines = [f"# Chunks ({stats})", ""] + for i, (chunk, text) in enumerate(zip(chunks, texts)): + headings = ( + " > ".join(chunk.meta.headings) if chunk.meta.headings else "(no heading)" + ) + lines += [f"## Chunk {i + 1} | {headings}", "", text, "", "---", ""] + return "\n".join(lines) + + +def page_count(doc) -> int: + pages = set() + for item, _ in doc.iterate_items(): + for prov in getattr(item, "prov", []): + pages.add(prov.page_no) + return len(pages) + + +def main(): + args = parse_args() + check_dependencies() + + tokenizer = None + if args.format == "chunks": + tokenizer = build_tokenizer(args.tokenizer, args.openai_model, args.max_tokens) + + if args.pipeline == "standard": + converter = build_standard_converter(args) + print( + f"Pipeline: standard (ocr={not args.no_ocr}, engine={args.ocr_engine})", + file=sys.stderr, + ) + elif args.pipeline == "vlm-local": + converter = build_vlm_local_converter(args) + print( + f"Pipeline: vlm-local (model={args.vlm_model}, " + f"force_backend_text={args.force_backend_text})", + file=sys.stderr, + ) + elif args.pipeline == "vlm-api": + converter = build_vlm_api_converter(args) + print( + f"Pipeline: vlm-api (url={args.vlm_api_url}, model={args.vlm_api_model})", + file=sys.stderr, + ) + + print(f"Converting: 
{args.source}", file=sys.stderr) + result = converter.convert(args.source) + doc = result.document + print(f"Pages processed: {page_count(doc)}", file=sys.stderr) + + if args.format == "markdown": + output = output_markdown(doc) + elif args.format == "json": + output = output_json(doc) + else: + output = output_chunks(doc, tokenizer) + + if args.out: + Path(args.out).write_text(output, encoding="utf-8") + print(f"Written to {args.out}", file=sys.stderr) + else: + print(output) + + +if __name__ == "__main__": + main() diff --git a/docs/examples/agent_skill/docling-document-intelligence/scripts/docling-evaluate.py b/docs/examples/agent_skill/docling-document-intelligence/scripts/docling-evaluate.py new file mode 100644 index 0000000000..534b9eff81 --- /dev/null +++ b/docs/examples/agent_skill/docling-document-intelligence/scripts/docling-evaluate.py @@ -0,0 +1,287 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: MIT +""" +Evaluate a Docling JSON export and suggest pipeline / option changes. 
+ +Typical flow (agent or human), from bundle root: + + python3 scripts/docling-convert.py input.pdf --format json --out doc.json + python3 scripts/docling-evaluate.py doc.json [--markdown out.md] + +Exit codes: 0 = pass; 1 = fail or --fail-on-warn with status warn +""" + +from __future__ import annotations + +import argparse +import json +import sys +from collections import Counter +from pathlib import Path +from typing import Any + + +def load_document(path: Path): + data = json.loads(path.read_text(encoding="utf-8")) + try: + from docling_core.types.doc.document import DoclingDocument + + return DoclingDocument.model_validate(data), data + except Exception: + return None, data + + +def page_numbers_from_doc(doc) -> set[int]: + pages: set[int] = set() + for item, _ in doc.iterate_items(): + for prov in getattr(item, "prov", None) or []: + p = getattr(prov, "page_no", None) + if p is not None: + pages.add(int(p)) + return pages + + +def collect_text_samples(doc, limit: int = 200) -> list[str]: + texts: list[str] = [] + for item, _ in doc.iterate_items(): + t = getattr(item, "text", None) + if t and str(t).strip(): + texts.append(str(t).strip()) + if len(texts) >= limit: + break + return texts + + +def metrics_from_doc(doc) -> dict[str, Any]: + n_tables = len(getattr(doc, "tables", []) or []) + n_pictures = len(getattr(doc, "pictures", []) or []) + n_headers = 0 + n_text_items = 0 + total_chars = 0 + for item, _ in doc.iterate_items(): + label = getattr(getattr(item, "label", None), "name", None) or "" + if label == "SECTION_HEADER": + n_headers += 1 + t = getattr(item, "text", None) + if t: + n_text_items += 1 + total_chars += len(str(t)) + + pages = page_numbers_from_doc(doc) + n_pages = len(pages) if pages else 0 + density = (total_chars / n_pages) if n_pages else total_chars + + samples = collect_text_samples(doc) + rep = Counter(samples) + top_rep = rep.most_common(1)[0] if rep else ("", 0) + dup_ratio = ( + sum(c for _, c in rep.items() if c > 2) / 
max(len(rep), 1) if rep else 0.0 + ) + + md = "" + try: + md = doc.export_to_markdown() + except Exception: + pass + + replacement = md.count("\ufffd") + sum(str(t).count("\ufffd") for t in samples) + + return { + "page_count": n_pages, + "section_headers": n_headers, + "text_items": n_text_items, + "total_text_chars": total_chars, + "chars_per_page": round(density, 2), + "tables": n_tables, + "pictures": n_pictures, + "markdown_chars": len(md), + "replacement_chars": replacement, + "most_repeated_text_count": int(top_rep[1]) if top_rep else 0, + "duplicate_heavy": dup_ratio > 0.15 and len(samples) > 10, + } + + +def heuristic_metrics(data: dict) -> dict[str, Any]: + """Fallback when DoclingDocument cannot be validated (older export / drift).""" + texts = data.get("texts") or [] + tables = data.get("tables") or [] + body = data.get("body") or {} + children = body.get("children") if isinstance(body, dict) else None + n_children = len(children) if isinstance(children, list) else 0 + char_sum = 0 + for t in texts: + if isinstance(t, dict): + char_sum += len(str(t.get("text") or "")) + return { + "page_count": 0, + "section_headers": 0, + "text_items": len(texts), + "total_text_chars": char_sum, + "chars_per_page": 0.0, + "tables": len(tables), + "pictures": len(data.get("pictures") or []), + "markdown_chars": 0, + "replacement_chars": 0, + "most_repeated_text_count": 0, + "duplicate_heavy": False, + "heuristic_only": True, + "body_children": n_children, + } + + +def evaluate( + m: dict[str, Any], + *, + expect_tables: bool, + min_chars_per_page: float, + min_markdown_chars: int, +) -> tuple[str, list[str], list[str]]: + issues: list[str] = [] + actions: list[str] = [] + + if m.get("heuristic_only"): + issues.append("Could not load full DoclingDocument; metrics are partial.") + actions.append( + "Ensure docling-core matches export; re-export with scripts/docling-convert.py --format json" + ) + + cpp = m.get("chars_per_page") or 0 + if m.get("page_count", 0) >= 2 and 
cpp < min_chars_per_page: + issues.append( + f"Low text density ({cpp} chars/page); likely scan, image-heavy PDF, or extraction gap." + ) + actions.append( + "Retry: standard pipeline with --ocr-engine tesseract, rapidocr, or mac" + ) + actions.append("Retry: --pipeline vlm-local (or vlm-api if GPU/API available)") + + if m.get("replacement_chars", 0) > 5: + issues.append( + "Unicode replacement characters detected; OCR may be garbling text." + ) + actions.append("Retry: --ocr-engine tesseract or rapidocr") + actions.append( + "Retry: --pipeline vlm-local --force-backend-text for hybrid text+VLM" + ) + + if m.get("duplicate_heavy") or (m.get("most_repeated_text_count", 0) > 8): + issues.append( + "Repeated text blocks; possible layout/OCR loop or bad reading order." + ) + actions.append("Retry: --pipeline vlm-local for complex layout") + actions.append("If using VLM: try --force-backend-text for text-heavy pages") + + if expect_tables and m.get("tables", 0) == 0: + issues.append("No tables detected but tables were expected.") + actions.append("Retry: standard pipeline without --no-tables") + actions.append("Retry: --pipeline vlm-local for merged-cell or visual tables") + + mc = m.get("markdown_chars", 0) + if mc > 0 and mc < min_markdown_chars and m.get("page_count", 0) >= 1: + issues.append(f"Markdown export is very short ({mc} chars) for the page count.") + actions.append("Retry: OCR/VLM pipelines as above") + + if m.get("text_items", 0) == 0 and m.get("page_count", 0) == 0: + issues.append( + "No text items and no page provenance; export may be empty or invalid." 
+ ) + actions.append( + "Verify source file opens correctly; retry with explicit --pipeline standard" + ) + + seen = set() + uniq_actions = [] + for a in actions: + if a not in seen: + seen.add(a) + uniq_actions.append(a) + + if not issues: + return "pass", [], [] + + severe = m.get("text_items", 0) == 0 or ( + m.get("page_count", 0) >= 1 and mc < 50 and mc > 0 + ) + status = "fail" if severe or m.get("replacement_chars", 0) > 20 else "warn" + return status, issues, uniq_actions + + +def parse_args(): + p = argparse.ArgumentParser(description="Evaluate Docling JSON export quality") + p.add_argument( + "json_path", type=Path, help="Path to DoclingDocument JSON (export_to_dict)" + ) + p.add_argument( + "--markdown", + type=Path, + default=None, + help="Optional markdown file to cross-check length", + ) + p.add_argument("--expect-tables", action="store_true") + p.add_argument("--min-chars-per-page", type=float, default=120.0) + p.add_argument("--min-markdown-chars", type=int, default=200) + p.add_argument("--fail-on-warn", action="store_true") + p.add_argument( + "--quiet", action="store_true", help="Only print JSON report to stdout" + ) + return p.parse_args() + + +def main() -> None: + args = parse_args() + if not args.json_path.is_file(): + print(json.dumps({"error": f"not found: {args.json_path}"}), file=sys.stderr) + sys.exit(1) + + doc, raw = load_document(args.json_path) + if doc is not None: + m = metrics_from_doc(doc) + else: + m = heuristic_metrics(raw) + + if args.markdown and args.markdown.is_file(): + md_len = len(args.markdown.read_text(encoding="utf-8")) + m["markdown_file_chars"] = md_len + if m.get("markdown_chars", 0) == 0: + m["markdown_chars"] = md_len + + status, issues, actions = evaluate( + m, + expect_tables=args.expect_tables, + min_chars_per_page=args.min_chars_per_page, + min_markdown_chars=args.min_markdown_chars, + ) + + report = { + "status": status, + "metrics": m, + "issues": issues, + "recommended_actions": actions, + 
"next_steps_for_agent": [ + "Re-run scripts/docling-convert.py with flags from recommended_actions.", + "Re-export JSON and run this script again until status is pass.", + "Append a row to improvement-log.md (see SKILL.md).", + ], + } + + print(json.dumps(report, indent=2, ensure_ascii=False)) + if not args.quiet: + print(f"\nstatus={status}", file=sys.stderr) + if issues: + print("issues:", file=sys.stderr) + for i in issues: + print(f" - {i}", file=sys.stderr) + if actions: + print("recommended_actions:", file=sys.stderr) + for a in actions: + print(f" - {a}", file=sys.stderr) + + if status == "fail": + sys.exit(1) + if status == "warn" and args.fail_on_warn: + sys.exit(1) + sys.exit(0) + + +if __name__ == "__main__": + main() diff --git a/docs/examples/agent_skill/docling-document-intelligence/scripts/requirements.txt b/docs/examples/agent_skill/docling-document-intelligence/scripts/requirements.txt new file mode 100644 index 0000000000..b4272960a5 --- /dev/null +++ b/docs/examples/agent_skill/docling-document-intelligence/scripts/requirements.txt @@ -0,0 +1,4 @@ +# From bundle root: pip install -r scripts/requirements.txt +docling>=2.81.0 +docling-core>=2.67.1 +packaging>=23.0 diff --git a/docs/examples/index.md b/docs/examples/index.md index f7d0fdcaac..4e910609e7 100644 --- a/docs/examples/index.md +++ b/docs/examples/index.md @@ -7,6 +7,7 @@ Here some of our picks to get you started: - 📤 [{==\[:fontawesome-solid-flask:{ title="beta feature" } beta\]==} structured data extraction](./extraction.ipynb) - examples for ✍️ [serialization](./serialization.ipynb) and ✂️ [chunking](./hybrid_chunking.ipynb), including [user-defined customizations](./advanced_chunking_and_serialization.ipynb) - 🖼️ [picture annotations](./pictures_description.ipynb) and [enrichments](./enrich_doclingdocument.py) +- 🤝 [**Agent skill**](./agent_skill/docling-document-intelligence/README.md) for Cursor and other assistants (`SKILL.md`, pipeline reference, `docling-convert.py` / 
`docling-evaluate.py` helpers) 👈 ... and there is much more: explore all the examples using the navigation menu on the side diff --git a/mkdocs.yml b/mkdocs.yml index 029f1d883e..7aba8cbd3a 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -80,6 +80,7 @@ nav: - Plugins: concepts/plugins.md - Examples: - Examples: examples/index.md + - "🤝 Agent skill (Cursor / assistants)": examples/agent_skill/docling-document-intelligence/README.md - 🔀 Conversion: - "Simple conversion": examples/minimal.py - "Custom conversion": examples/custom_convert.py From 041e709c66f23e15823b3249688b6222a0aa31d8 Mon Sep 17 00:00:00 2001 From: jehlum11 Date: Mon, 23 Mar 2026 15:28:00 -0400 Subject: [PATCH 2/5] docs: align agent skill README and EXAMPLE with Cursor bundle - Document both ~/.cursor/skills and docs/examples paths. - README notes repo parity for PRs and local installs. Made-with: Cursor --- .../docling-document-intelligence/EXAMPLE.md | 14 ++++++++++---- .../docling-document-intelligence/README.md | 12 +++++------- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/docs/examples/agent_skill/docling-document-intelligence/EXAMPLE.md b/docs/examples/agent_skill/docling-document-intelligence/EXAMPLE.md index ab1ae8986a..e05ba5fa83 100644 --- a/docs/examples/agent_skill/docling-document-intelligence/EXAMPLE.md +++ b/docs/examples/agent_skill/docling-document-intelligence/EXAMPLE.md @@ -1,15 +1,21 @@ # Using the Docling agent skill -[Agent Skills](https://agentskills.io/specification) are folders of instructions that AI coding agents (Cursor, Claude Code, GitHub Copilot, etc.) can load when relevant. This bundle lives in the Docling repo at: +[Agent Skills](https://agentskills.io/specification) are folders of instructions that AI coding agents (Cursor, Claude Code, GitHub Copilot, etc.) can load when relevant. 
-`docs/examples/agent_skill/docling-document-intelligence/` +## Where this bundle lives + +- **Cursor (local):** `~/.cursor/skills/docling-document-intelligence/` (or copy this folder there). +- **Docling repository (docs + PRs):** `docs/examples/agent_skill/docling-document-intelligence/` in [github.com/docling-project/docling](https://github.com/docling-project/docling). + +The two trees are kept in sync; use either source. ## Install (copy into your agent’s skills directory) ```bash -# From a checkout of github.com/docling-project/docling +# From a checkout of the Docling repo cp -r docs/examples/agent_skill/docling-document-intelligence ~/.cursor/skills/ -# or e.g. ~/.claude/skills/ depending on your tool + +# Or copy from another machine / archive into e.g. ~/.claude/skills/ ``` No extra config is required beyond installing Python dependencies (below). diff --git a/docs/examples/agent_skill/docling-document-intelligence/README.md b/docs/examples/agent_skill/docling-document-intelligence/README.md index 7d077e75fd..d43fcc9852 100644 --- a/docs/examples/agent_skill/docling-document-intelligence/README.md +++ b/docs/examples/agent_skill/docling-document-intelligence/README.md @@ -1,16 +1,18 @@ # Docling agent skill (Cursor & compatible assistants) -This folder is an **[Agent Skill](https://agentskills.io/specification)**-style bundle for AI coding assistants: structured instructions (`SKILL.md`), a pipeline reference (`pipelines.md`), optional helper scripts under `scripts/`, and an evaluator for conversion quality. +This folder is an **[Agent Skill](https://agentskills.io/specification)**-style bundle for AI coding assistants: structured instructions (`SKILL.md`), a pipeline reference (`pipelines.md`), helper scripts under `scripts/`, and an evaluator for conversion quality. 
It complements the official [Docling documentation](https://docling-project.github.io/docling/) and the [`docling` CLI](https://docling-project.github.io/docling/reference/cli/); use it when you want agents to follow a consistent **convert → export JSON → evaluate → refine** workflow. +The same layout is published in the Docling repo at `docs/examples/agent_skill/docling-document-intelligence/` (for docs and PRs). + ## Contents | Path | Purpose | |------|---------| | [`SKILL.md`](SKILL.md) | Full skill instructions (pipelines, chunking, evaluation loop) | | [`pipelines.md`](pipelines.md) | Standard vs VLM pipelines, OCR engines, API notes | -| [`EXAMPLE.md`](EXAMPLE.md) | Copying the skill into `~/.cursor/skills/` or similar; running scripts | +| [`EXAMPLE.md`](EXAMPLE.md) | Installing into `~/.cursor/skills/`; running scripts | | [`improvement-log.md`](improvement-log.md) | Optional template for local “what worked” notes | | [`scripts/docling-convert.py`](scripts/docling-convert.py) | CLI: Markdown / JSON / RAG chunks | | [`scripts/docling-evaluate.py`](scripts/docling-evaluate.py) | Heuristic quality report on JSON (+ optional Markdown) | @@ -27,10 +29,6 @@ python3 scripts/docling-evaluate.py /tmp/out.json --markdown /tmp/out.md Use `--pipeline vlm-local` or `--pipeline vlm-api` for vision-model pipelines; see `SKILL.md` and `pipelines.md`. -## Using as a Cursor / Claude skill - -Copy the folder `docling-document-intelligence` into your tool’s skills directory (see [`EXAMPLE.md`](EXAMPLE.md)). The `SKILL.md` frontmatter describes when the skill should activate. - ## License -Contributed under the same terms as the [Docling](https://github.com/docling-project/docling) repository (MIT). +MIT (aligned with [Docling](https://github.com/docling-project/docling)). 
From 32baebbcb0b96cd66ffa1369ed1fdebbb12dee47 Mon Sep 17 00:00:00 2001 From: jehlum11 Date: Mon, 23 Mar 2026 15:38:31 -0400 Subject: [PATCH 3/5] DCO Remediation Commit for jehlum11 I, jehlum11 , hereby add my Signed-off-by to this commit: 2d268ffb6fcc4c0e09f9a1482204a2e2501123d4 I, jehlum11 , hereby add my Signed-off-by to this commit: 041e709c66f23e15823b3249688b6222a0aa31d8 Signed-off-by: jehlum11 Made-with: Cursor From f4744220c61c2bd436e11ad291d92c72170c8d7d Mon Sep 17 00:00:00 2001 From: jehlum11 Date: Sun, 29 Mar 2026 19:37:41 -0400 Subject: [PATCH 4/5] docs: refactor agent skill to use docling CLI for conversion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Address maintainer feedback: the custom docling-convert.py script was largely redundant with the existing docling CLI. This commit: - Removes scripts/docling-convert.py (redundant with `docling` CLI) - Refactors SKILL.md (v1.4 → v2.0) to use `docling` CLI for all conversion tasks, reserving the Python API only for features the CLI does not expose (chunking, VLM API endpoint config, force_backend_text hybrid mode) - Updates docling-evaluate.py recommended_actions to reference `docling` CLI flags instead of the removed script - Updates README.md, EXAMPLE.md, pipelines.md to use `docling` CLI examples throughout - Simplifies requirements.txt (removes packaging dependency) The only custom script retained is docling-evaluate.py, which provides heuristic quality evaluation — functionality the CLI does not cover. 
Signed-off-by: jehlum11 Made-with: Cursor --- .../docling-document-intelligence/EXAMPLE.md | 44 ++- .../docling-document-intelligence/README.md | 33 +- .../docling-document-intelligence/SKILL.md | 186 +++++---- .../pipelines.md | 97 +++-- .../scripts/docling-convert.py | 360 ------------------ .../scripts/docling-evaluate.py | 31 +- .../scripts/requirements.txt | 3 +- 7 files changed, 243 insertions(+), 511 deletions(-) delete mode 100644 docs/examples/agent_skill/docling-document-intelligence/scripts/docling-convert.py diff --git a/docs/examples/agent_skill/docling-document-intelligence/EXAMPLE.md b/docs/examples/agent_skill/docling-document-intelligence/EXAMPLE.md index e05ba5fa83..b6993b1646 100644 --- a/docs/examples/agent_skill/docling-document-intelligence/EXAMPLE.md +++ b/docs/examples/agent_skill/docling-document-intelligence/EXAMPLE.md @@ -9,7 +9,7 @@ The two trees are kept in sync; use either source. -## Install (copy into your agent’s skills directory) +## Install (copy into your agent's skills directory) ```bash # From a checkout of the Docling repo @@ -40,39 +40,44 @@ Chunk invoice.pdf for RAG ingestion with 512 token chunks Process scanned.pdf using the VLM pipeline ``` -The agent should read `SKILL.md`, match the task, and run the appropriate pipeline. +The agent should read `SKILL.md`, match the task, and run the appropriate +`docling` CLI command or Python API call. 
-## Running the helper scripts directly - -From the **bundle root** (the `docling-document-intelligence` directory): +## Running the docling CLI directly ```bash -pip install -r scripts/requirements.txt +pip install docling docling-core + +# Basic conversion to Markdown +docling report.pdf --output /tmp/ -python3 scripts/docling-convert.py report.pdf +# JSON output +docling report.pdf --to json --output /tmp/ -python3 scripts/docling-convert.py report.pdf --ocr-engine rapidocr +# Custom OCR engine +docling report.pdf --ocr-engine rapidocr --output /tmp/ -python3 scripts/docling-convert.py report.pdf --format chunks --max-tokens 512 +# VLM pipeline +docling scanned.pdf --pipeline vlm --output /tmp/ -python3 scripts/docling-convert.py scanned.pdf --pipeline vlm-local +# VLM with specific model +docling scanned.pdf --pipeline vlm --vlm-model granite_docling --output /tmp/ -python3 scripts/docling-convert.py doc.pdf \ - --pipeline vlm-api \ - --vlm-api-url http://localhost:8000/v1/chat/completions \ - --vlm-api-model ibm-granite/granite-docling-258M +# Remote VLM services +docling doc.pdf --pipeline vlm --enable-remote-services --output /tmp/ ``` ## Evaluate and refine ```bash -python3 scripts/docling-convert.py report.pdf --format json --out /tmp/doc.json -python3 scripts/docling-convert.py report.pdf --format markdown --out /tmp/doc.md -python3 scripts/docling-evaluate.py /tmp/doc.json --markdown /tmp/doc.md +docling report.pdf --to json --output /tmp/ +docling report.pdf --to md --output /tmp/ +python3 scripts/docling-evaluate.py /tmp/report.json --markdown /tmp/report.md ``` -If the report shows `warn` or `fail`, follow `recommended_actions`, re-convert, -and optionally append a note to `improvement-log.md` (see `SKILL.md` section 6). +If the report shows `warn` or `fail`, follow `recommended_actions`, re-convert +with `docling` using the suggested flags, and optionally append a note to +`improvement-log.md` (see `SKILL.md` section 7). 
## What the skill covers @@ -90,4 +95,5 @@ and optionally append a note to `improvement-log.md` (see `SKILL.md` section 6). - [Agent Skills specification](https://agentskills.io/specification) - [Docling documentation](https://docling-project.github.io/docling/) +- [Docling CLI reference](https://docling-project.github.io/docling/reference/cli/) - [Docling GitHub](https://github.com/docling-project/docling) diff --git a/docs/examples/agent_skill/docling-document-intelligence/README.md b/docs/examples/agent_skill/docling-document-intelligence/README.md index d43fcc9852..65382ccb9d 100644 --- a/docs/examples/agent_skill/docling-document-intelligence/README.md +++ b/docs/examples/agent_skill/docling-document-intelligence/README.md @@ -1,8 +1,12 @@ # Docling agent skill (Cursor & compatible assistants) -This folder is an **[Agent Skill](https://agentskills.io/specification)**-style bundle for AI coding assistants: structured instructions (`SKILL.md`), a pipeline reference (`pipelines.md`), helper scripts under `scripts/`, and an evaluator for conversion quality. +This folder is an **[Agent Skill](https://agentskills.io/specification)**-style bundle for AI coding assistants: structured instructions (`SKILL.md`), a pipeline reference (`pipelines.md`), and a quality evaluator (`scripts/docling-evaluate.py`). -It complements the official [Docling documentation](https://docling-project.github.io/docling/) and the [`docling` CLI](https://docling-project.github.io/docling/reference/cli/); use it when you want agents to follow a consistent **convert → export JSON → evaluate → refine** workflow. +Conversion is done via the **`docling` CLI** (included with `pip install docling`). +The evaluator provides a **convert → evaluate → refine** feedback loop that the +existing CLI does not cover. + +It complements the official [Docling documentation](https://docling-project.github.io/docling/) and the [`docling` CLI reference](https://docling-project.github.io/docling/reference/cli/). 
The same layout is published in the Docling repo at `docs/examples/agent_skill/docling-document-intelligence/` (for docs and PRs). @@ -12,22 +16,27 @@ The same layout is published in the Docling repo at `docs/examples/agent_skill/d |------|---------| | [`SKILL.md`](SKILL.md) | Full skill instructions (pipelines, chunking, evaluation loop) | | [`pipelines.md`](pipelines.md) | Standard vs VLM pipelines, OCR engines, API notes | -| [`EXAMPLE.md`](EXAMPLE.md) | Installing into `~/.cursor/skills/`; running scripts | -| [`improvement-log.md`](improvement-log.md) | Optional template for local “what worked” notes | -| [`scripts/docling-convert.py`](scripts/docling-convert.py) | CLI: Markdown / JSON / RAG chunks | +| [`EXAMPLE.md`](EXAMPLE.md) | Installing into `~/.cursor/skills/`; running the CLI and evaluator | +| [`improvement-log.md`](improvement-log.md) | Optional template for local "what worked" notes | | [`scripts/docling-evaluate.py`](scripts/docling-evaluate.py) | Heuristic quality report on JSON (+ optional Markdown) | -| [`scripts/requirements.txt`](scripts/requirements.txt) | Minimal pip deps for the scripts | +| [`scripts/requirements.txt`](scripts/requirements.txt) | Minimal pip deps for the evaluator | -## Quick start (from this directory) +## Quick start ```bash -pip install -r scripts/requirements.txt -python3 scripts/docling-convert.py https://arxiv.org/pdf/2408.09869 --out /tmp/out.md -python3 scripts/docling-convert.py https://arxiv.org/pdf/2408.09869 --format json --out /tmp/out.json -python3 scripts/docling-evaluate.py /tmp/out.json --markdown /tmp/out.md +pip install docling docling-core + +# Convert to Markdown +docling https://arxiv.org/pdf/2408.09869 --output /tmp/ + +# Convert to JSON +docling https://arxiv.org/pdf/2408.09869 --to json --output /tmp/ + +# Evaluate quality +python3 scripts/docling-evaluate.py /tmp/2408.09869.json --markdown /tmp/2408.09869.md ``` -Use `--pipeline vlm-local` or `--pipeline vlm-api` for vision-model pipelines; see 
`SKILL.md` and `pipelines.md`. +Use `--pipeline vlm` for vision-model pipelines; see `SKILL.md` and `pipelines.md`. ## License diff --git a/docs/examples/agent_skill/docling-document-intelligence/SKILL.md b/docs/examples/agent_skill/docling-document-intelligence/SKILL.md index 9c5ca1b1d7..7e3927a680 100644 --- a/docs/examples/agent_skill/docling-document-intelligence/SKILL.md +++ b/docs/examples/agent_skill/docling-document-intelligence/SKILL.md @@ -14,9 +14,9 @@ license: MIT compatibility: Requires Python 3.10+, docling>=2.81.0, docling-core>=2.67.1 metadata: author: docling-project - version: "1.4" + version: "2.0" upstream: https://github.com/docling-project/docling -allowed-tools: Bash(python3:*) Bash(pip:*) +allowed-tools: Bash(docling:*) Bash(python3:*) Bash(pip:*) --- # Docling Document Intelligence Skill @@ -25,6 +25,10 @@ Use this skill to parse, convert, chunk, and analyze documents with Docling. It handles both local file paths and URLs, and outputs either Markdown or structured JSON (`DoclingDocument`). +Conversion uses the **`docling` CLI** (installed with `pip install docling`). +The Python API is used only for features the CLI does not expose (chunking, +VLM remote-API endpoint configuration, hybrid `force_backend_text` mode). + ## Scope | Task | Covered | @@ -32,8 +36,8 @@ structured JSON (`DoclingDocument`). | Parse PDF / DOCX / PPTX / HTML / image | ✅ | | Convert to Markdown | ✅ | | Export as DoclingDocument JSON | ✅ | -| Chunk for RAG (hybrid: heading + token) | ✅ | -| Analyze structure (headings, tables, figures) | ✅ | +| Chunk for RAG (hybrid: heading + token) | ✅ (Python API) | +| Analyze structure (headings, tables, figures) | ✅ (Python API) | | OCR for scanned PDFs | ✅ (auto-enabled) | | Multi-source batch conversion | ✅ | @@ -42,33 +46,64 @@ structured JSON (`DoclingDocument`). ### 1. Resolve the input Determine whether the user supplied a **local path** or a **URL**. +The `docling` CLI accepts both directly. 
-- Local path → pass as `str` or `Path` directly to `DocumentConverter` -- URL → pass as `str`; Docling fetches it automatically -- Multiple inputs → pass a list - -```python -sources = ["path/to/file.pdf"] # local -sources = ["https://example.com/a.pdf"] # URL -sources = ["file1.pdf", "file2.docx"] # batch +```bash +docling path/to/file.pdf +docling https://example.com/a.pdf ``` ### 2. Choose a pipeline -Docling has three pipelines. Pick based on document type and hardware. +Docling has two pipeline families. Pick based on document type and hardware. -| Pipeline | Best for | Key tradeoff | -|---|---|---| -| **Standard** (default) | Born-digital PDFs, speed | No GPU needed; OCR for scanned pages | -| **VLM local** | Complex layouts, handwriting, formulas | Needs GPU; slower | -| **VLM API** | Production scale, remote inference | Requires inference server | +| Pipeline | CLI flag | Best for | Key tradeoff | +|---|---|---|---| +| **Standard** (default) | `--pipeline standard` | Born-digital PDFs, speed | No GPU needed; OCR for scanned pages | +| **VLM** | `--pipeline vlm` | Complex layouts, handwriting, formulas | Needs GPU; slower | See [pipelines.md](pipelines.md) for the full decision matrix, OCR engine table -(EasyOCR, RapidOCR, Tesseract, macOS; Tesseract CLI and future engines such as -Nemotron in Python only when supported by your Docling version), and VLM presets. +(EasyOCR, RapidOCR, Tesseract, macOS), and VLM model presets. ### 3. 
Convert the document +#### CLI (preferred for straightforward conversions) + +```bash +# Markdown (default output) +docling report.pdf --output /tmp/ + +# JSON (structured, lossless) +docling report.pdf --to json --output /tmp/ + +# VLM pipeline +docling report.pdf --pipeline vlm --output /tmp/ + +# VLM with specific model +docling report.pdf --pipeline vlm --vlm-model granite_docling --output /tmp/ + +# Custom OCR engine +docling report.pdf --ocr-engine tesserocr --output /tmp/ + +# Disable OCR or tables for speed +docling report.pdf --no-ocr --output /tmp/ +docling report.pdf --no-tables --output /tmp/ + +# Remote VLM services +docling report.pdf --pipeline vlm --enable-remote-services --output /tmp/ +``` + +The CLI writes output files to the `--output` directory, named after the +input file (e.g. `report.pdf` → `report.md` or `report.json`). + +**CLI reference:** <https://docling-project.github.io/docling/reference/cli/> + +#### Python API (for advanced features) + +Use the Python API when you need features the CLI does not expose: +chunking, VLM remote-API endpoint configuration, or hybrid +`force_backend_text` mode. + **Docling 2.81+ API note:** `DocumentConverter(format_options=...)` expects `dict[InputFormat, FormatOption]` (e.g. `InputFormat.PDF` → `PdfFormatOption`).
Using string keys like `{"pdf": PdfPipelineOptions(...)}` fails at runtime with @@ -80,11 +115,9 @@ from docling.document_converter import DocumentConverter, PdfFormatOption from docling.datamodel.base_models import InputFormat from docling.datamodel.pipeline_options import PdfPipelineOptions -# Defaults: standard PDF pipeline, OCR + tables converter = DocumentConverter() -result = converter.convert(sources[0]) +result = converter.convert("report.pdf") -# Custom PdfPipelineOptions (same API as scripts/docling-convert.py --pipeline standard) converter = DocumentConverter( format_options={ InputFormat.PDF: PdfFormatOption( @@ -92,7 +125,7 @@ converter = DocumentConverter( ), } ) -result = converter.convert(sources[0]) +result = converter.convert("report.pdf") ``` **VLM pipeline — local (GraniteDocling via HF Transformers):** @@ -115,10 +148,14 @@ converter = DocumentConverter( ) } ) -result = converter.convert(sources[0]) +result = converter.convert("report.pdf") ``` **VLM pipeline — remote API (vLLM / LM Studio / Ollama):** + +This is only available via the Python API; the CLI does not expose endpoint +URL, model name, or API key configuration. + ```python from docling.document_converter import DocumentConverter, PdfFormatOption from docling.datamodel.base_models import InputFormat @@ -146,29 +183,44 @@ converter = DocumentConverter( ) } ) -result = converter.convert(sources[0]) +result = converter.convert("report.pdf") ``` -`result.document` is a `DoclingDocument` object in all three cases. +**Hybrid mode (force_backend_text) — Python API only:** -### 3. Choose output format +Uses deterministic PDF text extraction for text regions while routing +images and tables through the VLM. Reduces hallucination on text-heavy pages. 
-**Markdown** (default, human-readable): ```python -md = result.document.export_to_markdown() +pipeline_options = VlmPipelineOptions( + vlm_options=vlm_model_specs.GRANITEDOCLING_TRANSFORMERS, + force_backend_text=True, + generate_page_images=True, +) +``` + +`result.document` is a `DoclingDocument` object in all cases. + +### 4. Choose output format + +**Markdown** (default, human-readable): +```bash +docling report.pdf --to md --output /tmp/ ``` +Or via Python: `result.document.export_to_markdown()` **JSON / DoclingDocument** (structured, lossless): -```python -import json -doc_json = result.document.model_dump() # dict -doc_json_str = result.document.export_to_dict() # serialisable dict +```bash +docling report.pdf --to json --output /tmp/ ``` +Or via Python: `result.document.export_to_dict()` > If the user does not specify a format, ask: "Should I output Markdown or > structured JSON (DoclingDocument)?" -### 4. Chunk for RAG (hybrid strategy) +### 5. Chunk for RAG (hybrid strategy) + +Chunking is only available via the Python API. Default: **hybrid chunker** — splits first by heading hierarchy, then subdivides oversized sections by token count. This preserves semantic @@ -190,8 +242,6 @@ chunker = HybridChunker(tokenizer=tokenizer, merge_peers=True) chunks = list(chunker.chunk(result.document)) for chunk in chunks: - # contextualize() is the correct method for embedding-ready text — - # it enriches chunk.text with heading breadcrumb metadata embed_text = chunker.contextualize(chunk) print(chunk.meta.headings) # heading breadcrumb list print(chunk.meta.origin.page_no) # source page number @@ -212,31 +262,28 @@ tokenizer = OpenAITokenizer( For chunking strategies and tokenizer details, see the Docling documentation on chunking and `HybridChunker`. -### 5. Analyze document structure +### 6. 
Analyze document structure Use the `DoclingDocument` object directly to inspect structure: ```python doc = result.document -# Iterate headings for item, level in doc.iterate_items(): if hasattr(item, 'label') and item.label.name == 'SECTION_HEADER': print(f"{'#' * level} {item.text}") -# Extract tables for table in doc.tables: print(table.export_to_dataframe()) # pandas DataFrame print(table.export_to_markdown()) -# Extract figures / images for picture in doc.pictures: print(picture.caption_text(doc)) # caption if present ``` -For the full API surface, see Docling’s structure and table export docs. +For the full API surface, see Docling's structure and table export docs. -### 6. Evaluate output and iterate (required for “best effort” conversions) +### 7. Evaluate output and iterate (required for "best effort" conversions) After **every** conversion where the user cares about fidelity (not quick previews), run the bundled evaluator on the JSON export, then refine the @@ -246,30 +293,29 @@ the run** without guessing. **Step A — Produce JSON and optional Markdown** ```bash -# From the bundle root (directory containing scripts/ and SKILL.md): -python3 scripts/docling-convert.py "<source>" --format json --out /tmp/docling-out.json -python3 scripts/docling-convert.py "<source>" --format markdown --out /tmp/docling-out.md +docling "<source>" --to json --output /tmp/ +docling "<source>" --to md --output /tmp/ ``` **Step B — Evaluate** ```bash -python3 scripts/docling-evaluate.py /tmp/docling-out.json --markdown /tmp/docling-out.md +python3 scripts/docling-evaluate.py /tmp/<name>.json --markdown /tmp/<name>.md ``` If the user expects tables (invoices, spreadsheets in PDF), add `--expect-tables`. Tighten gates with `--fail-on-warn` in CI-style checks. The script prints a JSON report to stdout: `status` (`pass` | `warn` | `fail`), -`metrics`, `issues`, and `recommended_actions` (concrete `scripts/docling-convert.py` +`metrics`, `issues`, and `recommended_actions` (concrete `docling`
**Step C — Refinement loop (max 3 attempts unless the user says otherwise)** 1. If `status` is `warn` or `fail`, apply **one** primary change from - `recommended_actions` (e.g. switch standard → VLM, change OCR engine, - ensure tables are enabled, hybrid `--force-backend-text`). -2. Re-convert, re-export JSON, re-run `scripts/docling-evaluate.py`. + `recommended_actions` (e.g. switch `--pipeline vlm`, change + `--ocr-engine`, ensure tables are enabled). +2. Re-convert with `docling`, re-run `scripts/docling-evaluate.py`. 3. Stop when `status` is `pass`, or after 3 iterations — then summarize what worked and any remaining issues for the user. @@ -286,33 +332,33 @@ After a successful pass **or** after the final iteration, append one entry to This log is optional for the user to git-ignore; it is for **local** learning so future runs on similar documents start closer to the right pipeline. -### 7. Agent quality checklist (manual, if script unavailable) +### 8. Agent quality checklist (manual, if script unavailable) If `scripts/docling-evaluate.py` cannot run, still verify: | Check | Action if bad | |---|---| -| Page count matches source (roughly) | Re-run; try VLM if layout is complex | +| Page count matches source (roughly) | Re-run; try `--pipeline vlm` if layout is complex | | Markdown is not near-empty | Enable OCR / VLM | -| Tables missing when visually obvious | Enable table structure; try VLM | -| `\ufffd` replacement characters | Different OCR or VLM | -| Same line repeated many times | VLM or hybrid `--force-backend-text` | +| Tables missing when visually obvious | Remove `--no-tables`; try `--pipeline vlm` | +| `\ufffd` replacement characters | Different `--ocr-engine` or `--pipeline vlm` | +| Same line repeated many times | `--pipeline vlm` or hybrid `force_backend_text` (Python API) | ## Common Edge Cases | Situation | Handling | |---|---| -| Scanned / image-only PDF | Standard pipeline with OCR, or VLM pipeline for best quality | -| Password-protected PDF 
| Will raise `ConversionError`; surface to user | -| Very large document (500+ pages) | Standard pipeline with `do_table_structure=False` for speed | -| Complex layout / multi-column | Prefer VLM pipeline; standard may misorder reading flow | -| Handwriting or formulas | VLM pipeline only — standard OCR will not handle these | +| Scanned / image-only PDF | Standard pipeline with OCR, or `--pipeline vlm` for best quality | +| Password-protected PDF | `--pdf-password PASSWORD`; will raise `ConversionError` if wrong | +| Very large document (500+ pages) | Standard pipeline with `--no-tables` for speed | +| Complex layout / multi-column | `--pipeline vlm`; standard may misorder reading flow | +| Handwriting or formulas | `--pipeline vlm` only — standard OCR will not handle these | | URL behind auth | Pre-download to temp file; pass local path | -| Tables with merged cells | `table.export_to_markdown()` handles spans; VLM pipeline often more accurate | +| Tables with merged cells | `table.export_to_markdown()` handles spans; VLM often more accurate | | Non-UTF-8 encoding | Docling normalises internally; no special handling needed | -| VLM hallucinating text | Set `force_backend_text=True` for hybrid mode (PDF text + VLM layout) | -| VLM API call blocked | `enable_remote_services=True` is mandatory on `VlmPipelineOptions` | -| Apple Silicon | Use `GRANITEDOCLING_MLX` preset for MPS acceleration | +| VLM hallucinating text | `force_backend_text=True` via Python API for hybrid mode | +| VLM API call blocked | `--enable-remote-services` (CLI) or `enable_remote_services=True` (Python) | +| Apple Silicon | `--vlm-model granite_docling` with MLX backend, or `GRANITEDOCLING_MLX` preset (Python API) | ## Pipeline reference @@ -331,20 +377,14 @@ server configuration: [pipelines.md](pipelines.md) ## Dependencies -Install from the bundled requirements file (always pulls latest compatible): - -```bash -pip install -r scripts/requirements.txt -``` - -Or manually: - ```bash pip 
install docling docling-core # For OpenAI tokenizer support: pip install 'docling-core[chunking-openai]' ``` +The `docling` CLI is included with the `docling` package — no separate install needed. + Check installed versions (prefer distribution metadata — `docling` may not set `__version__`): ```python diff --git a/docs/examples/agent_skill/docling-document-intelligence/pipelines.md b/docs/examples/agent_skill/docling-document-intelligence/pipelines.md index e50d8af140..d52208fa18 100644 --- a/docs/examples/agent_skill/docling-document-intelligence/pipelines.md +++ b/docs/examples/agent_skill/docling-document-intelligence/pipelines.md @@ -1,8 +1,8 @@ # Docling Pipelines Reference Docling has two pipeline families for PDFs: **standard** (parse + OCR + layout/tables) -and **VLM** (page images through a vision-language model). The helper -`scripts/docling-convert.py` exposes **three modes**: `standard`, `vlm-local`, `vlm-api`. +and **VLM** (page images through a vision-language model). The `docling` CLI +exposes both via `--pipeline standard` (default) and `--pipeline vlm`. The right choice depends on document type, hardware, and latency budget. --- @@ -13,11 +13,11 @@ The right choice depends on document type, hardware, and latency budget. 
|---|---|---| | Born-digital PDF (text selectable) | Standard | Fast, accurate, no GPU needed | | Scanned PDF / image-only | Standard + OCR or VLM | Depends on quality | -| Complex layout (multi-column, dense tables) | VLM local | Better structural understanding | +| Complex layout (multi-column, dense tables) | VLM | Better structural understanding | | Handwriting, formulas, figures with embedded text | VLM | Only viable option | | Air-gapped / no GPU | Standard | Runs on CPU | -| Production scale, GPU server available | VLM API (vLLM) | Best throughput | -| Apple Silicon / local dev | VLM local (MLX) | MPS acceleration | +| Production scale, GPU server available | VLM (vLLM) | Best throughput | +| Apple Silicon / local dev | VLM (MLX) | MPS acceleration | | Speed-critical, accuracy secondary | Standard, no tables | Fastest path | --- @@ -27,6 +27,22 @@ The right choice depends on document type, hardware, and latency budget. Uses deterministic PDF parsing (docling-parse) + optional neural OCR + neural table structure detection. +### CLI usage + +```bash +# Default (standard pipeline, OCR + tables enabled) +docling report.pdf --output /tmp/ + +# Custom OCR engine +docling report.pdf --ocr-engine tesserocr --output /tmp/ + +# Disable OCR or tables +docling report.pdf --no-ocr --output /tmp/ +docling report.pdf --no-tables --output /tmp/ +``` + +### Python API + ```python from docling.document_converter import DocumentConverter, PdfFormatOption from docling.datamodel.base_models import InputFormat @@ -50,7 +66,20 @@ converter = DocumentConverter( ### OCR engine options -All engines are plug-and-play via `ocr_options`. Default is EasyOCR. +All engines are plug-and-play via the CLI `--ocr-engine` flag or the Python +`ocr_options` parameter. Default is EasyOCR. 
+ +#### CLI flags + +| Engine | CLI flag | Notes | +|--------|----------|-------| +| EasyOCR | `--ocr-engine easyocr` (default) | No extra pip beyond docling defaults | +| RapidOCR | `--ocr-engine rapidocr` | Lightweight; see Docling notes on read-only FS | +| Tesseract (Python) | `--ocr-engine tesserocr` | Needs `pip install tesserocr` and system Tesseract | +| Tesseract (CLI) | `--ocr-engine tesseract` | Shells out to `tesseract` binary | +| macOS Vision | `--ocr-engine ocrmac` | macOS only | + +#### Python API ```python # EasyOCR (default — no extra install needed) @@ -70,21 +99,6 @@ from docling.datamodel.pipeline_options import OcrMacOptions opts = PdfPipelineOptions(do_ocr=True, ocr_options=OcrMacOptions()) ``` -### Standard pipeline + OCR: CLI vs Python - -`scripts/docling-convert.py` (`--pipeline standard`) maps engines like this: - -| Engine | CLI | Notes | -|--------|-----|--------| -| EasyOCR | `--ocr-engine easyocr` (default) | No extra pip beyond docling defaults | -| RapidOCR | `--ocr-engine rapidocr` | Lightweight; see Docling notes on read-only FS | -| Tesseract | `--ocr-engine tesseract` | Uses `TesseractOcrOptions` → needs **`pip install tesserocr`** and system Tesseract | -| macOS Vision | `--ocr-engine mac` | `OcrMacOptions` | - -**Tesseract without `tesserocr`:** Docling also provides `TesseractCliOcrOptions` (shell out to the `tesseract` binary). This helper CLI does not expose it yet; set `ocr_options=TesseractCliOcrOptions()` in Python if you only have the CLI installed. - -**NVIDIA Nemotron OCR:** Not exposed in `docling-convert.py` and not present in docling **2.81.x** `pipeline_options` on PyPI. If a future Docling release adds a Nemotron options class, configure `PdfPipelineOptions(ocr_options=...)` in Python (see [pipeline options](https://docling-project.github.io/docling/reference/pipeline_options/) for your installed version) or extend the CLI. 
- --- ## Pipeline 2: VLM Pipeline — local inference @@ -92,6 +106,18 @@ opts = PdfPipelineOptions(do_ocr=True, ocr_options=OcrMacOptions()) Processes each page as an image through a vision-language model. Replaces the standard layout detection + OCR stack entirely. +### CLI usage + +```bash +# Default VLM model (granite_docling) +docling report.pdf --pipeline vlm --output /tmp/ + +# Specific model +docling report.pdf --pipeline vlm --vlm-model smoldocling --output /tmp/ +``` + +### Python API + ```python from docling.document_converter import DocumentConverter, PdfFormatOption from docling.datamodel.base_models import InputFormat @@ -114,20 +140,20 @@ converter = DocumentConverter( ) ``` -### Available model presets (`vlm_model_specs`) +### Available model presets -| Preset | Model | Backend | Device | Notes | +| CLI `--vlm-model` | Python preset (`vlm_model_specs`) | Backend | Device | Notes | |---|---|---|---|---| -| `GRANITEDOCLING_TRANSFORMERS` | granite-docling-258M | HF Transformers | CPU/GPU | Default | -| `SMOLDOCLING_TRANSFORMERS` | smoldocling-256M | HF Transformers | CPU/GPU | Lighter | -| `GRANITEDOCLING_VLLM` | granite-docling-258M | vLLM | GPU | Fast batch | -| `GRANITEDOCLING_MLX` | granite-docling-258M-mlx | MLX | Apple MPS | M-series Macs | +| `granite_docling` | `GRANITEDOCLING_TRANSFORMERS` | HF Transformers | CPU/GPU | Default | +| `smoldocling` | `SMOLDOCLING_TRANSFORMERS` | HF Transformers | CPU/GPU | Lighter | +| (Python API only) | `GRANITEDOCLING_VLLM` | vLLM | GPU | Fast batch | +| (Python API only) | `GRANITEDOCLING_MLX` | MLX | Apple MPS | M-series Macs | ### Hybrid mode: PDF text + VLM for images/tables -Set `force_backend_text=True` to use deterministic text extraction for normal -text regions while routing images and tables through the VLM. Reduces -hallucination risk on text-heavy pages. 
+Set `force_backend_text=True` (Python API only) to use deterministic text +extraction for normal text regions while routing images and tables through the +VLM. Reduces hallucination risk on text-heavy pages. ```python pipeline_options = VlmPipelineOptions( @@ -144,6 +170,17 @@ pipeline_options = VlmPipelineOptions( Sends page images to any OpenAI-compatible endpoint. Works with vLLM, LM Studio, Ollama, or a hosted model API. +This is available via the CLI with `--pipeline vlm --enable-remote-services`, +but endpoint URL, model name, and API key configuration require the Python API. + +### CLI usage (basic) + +```bash +docling report.pdf --pipeline vlm --enable-remote-services --output /tmp/ +``` + +### Python API (full configuration) + ```python from docling.document_converter import DocumentConverter, PdfFormatOption from docling.datamodel.base_models import InputFormat diff --git a/docs/examples/agent_skill/docling-document-intelligence/scripts/docling-convert.py b/docs/examples/agent_skill/docling-document-intelligence/scripts/docling-convert.py deleted file mode 100644 index efc7fa3f67..0000000000 --- a/docs/examples/agent_skill/docling-document-intelligence/scripts/docling-convert.py +++ /dev/null @@ -1,360 +0,0 @@ -#!/usr/bin/env python3 -# SPDX-License-Identifier: MIT -""" -docling-convert.py — Parse a document and emit Markdown, JSON, or RAG chunks. - -Requires: docling>=2.81.0, docling-core>=2.67.1, packaging -Install: pip install -r scripts/requirements.txt (from the bundle root directory) - -Usage (from bundle root, i.e. 
docling-document-intelligence/): - python3 scripts/docling-convert.py [options] - -Arguments: - source Local file path or URL (required) - -Pipeline selection: - --pipeline standard | vlm-local | vlm-api (default: standard) - -Standard pipeline options: - --ocr-engine easyocr | tesseract | rapidocr | mac (default: easyocr) - --no-ocr Disable OCR entirely - --no-tables Skip table structure parsing (faster) - -VLM local pipeline options (--pipeline vlm-local): - --vlm-model granitedocling | smoldocling | granitedocling-vllm | granitedocling-mlx - (default: granitedocling) - --force-backend-text - Hybrid mode: use PDF text extraction for text, VLM for images/tables - -VLM API pipeline options (--pipeline vlm-api): - --vlm-api-url OpenAI-compatible endpoint (e.g. http://localhost:8000/v1/chat/completions) - --vlm-api-model Model name on the server (e.g. ibm-granite/granite-docling-258M) - --vlm-api-key API key if required (default: none) - -Output options: - --format markdown | json | chunks (default: markdown) - --max-tokens Max tokens per chunk (default: 512) - --tokenizer HuggingFace model id for chunking - --openai-model Use OpenAI tiktoken tokenizer for chunking - Requires: pip install 'docling-core[chunking-openai]' - --out Write output to file instead of stdout -""" - -import argparse -import json -import sys -from pathlib import Path - -MIN_DOCLING_VERSION = "2.81.0" -MIN_DOCLING_CORE_VERSION = "2.67.1" - - -def parse_args(): - p = argparse.ArgumentParser(description="Docling document converter") - p.add_argument("source", help="File path or URL") - - p.add_argument( - "--pipeline", choices=["standard", "vlm-local", "vlm-api"], default="standard" - ) - - p.add_argument( - "--ocr-engine", - choices=["easyocr", "tesseract", "rapidocr", "mac"], - default="easyocr", - ) - p.add_argument("--no-ocr", action="store_true") - p.add_argument("--no-tables", action="store_true") - - p.add_argument( - "--vlm-model", - choices=[ - "granitedocling", - "smoldocling", - 
"granitedocling-vllm", - "granitedocling-mlx", - ], - default="granitedocling", - ) - p.add_argument( - "--force-backend-text", - action="store_true", - help="Hybrid: PDF text for text regions, VLM for images/tables", - ) - - p.add_argument("--vlm-api-url", default="http://localhost:8000/v1/chat/completions") - p.add_argument("--vlm-api-model", default="ibm-granite/granite-docling-258M") - p.add_argument("--vlm-api-key", default=None) - - p.add_argument( - "--format", choices=["markdown", "json", "chunks"], default="markdown" - ) - p.add_argument("--max-tokens", type=int, default=512) - p.add_argument("--tokenizer", default="sentence-transformers/all-MiniLM-L6-v2") - p.add_argument("--openai-model", default=None) - p.add_argument("--out", default=None) - return p.parse_args() - - -def check_dependencies(): - from importlib.metadata import PackageNotFoundError, version as dist_version - - from packaging.version import Version - - missing: list[str] = [] - checks = [ - ("docling", "docling", MIN_DOCLING_VERSION), - ("docling_core", "docling-core", MIN_DOCLING_CORE_VERSION), - ] - for import_name, dist_name, min_ver in checks: - try: - __import__(import_name) - except ImportError: - missing.append(dist_name) - continue - try: - ver = dist_version(dist_name) - except PackageNotFoundError: - ver = "0.0.0" - if Version(ver) < Version(min_ver): - print( - f"WARNING: {dist_name}>={min_ver} recommended, found {ver}. 
" - f"Run: pip install --upgrade {dist_name}", - file=sys.stderr, - ) - if missing: - print( - f"ERROR: missing packages: {' '.join(missing)}\n" - f"Run: pip install -r scripts/requirements.txt (from the bundle root directory)", - file=sys.stderr, - ) - sys.exit(1) - - -def build_standard_converter(args): - from docling.datamodel.base_models import InputFormat - from docling.datamodel.pipeline_options import PdfPipelineOptions - from docling.document_converter import DocumentConverter, PdfFormatOption - - ocr_opts = None - if not args.no_ocr: - engine = args.ocr_engine - if engine == "tesseract": - from docling.datamodel.pipeline_options import TesseractOcrOptions - - ocr_opts = TesseractOcrOptions() - elif engine == "rapidocr": - from docling.datamodel.pipeline_options import RapidOcrOptions - - ocr_opts = RapidOcrOptions() - elif engine == "mac": - from docling.datamodel.pipeline_options import OcrMacOptions - - ocr_opts = OcrMacOptions() - - kwargs = dict( - do_ocr=not args.no_ocr, - do_table_structure=not args.no_tables, - ) - if ocr_opts is not None: - kwargs["ocr_options"] = ocr_opts - - pipeline_options = PdfPipelineOptions(**kwargs) - return DocumentConverter( - format_options={ - InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options), - } - ) - - -def build_vlm_local_converter(args): - from docling.datamodel import vlm_model_specs - from docling.datamodel.base_models import InputFormat - from docling.datamodel.pipeline_options import VlmPipelineOptions - from docling.document_converter import DocumentConverter, PdfFormatOption - from docling.pipeline.vlm_pipeline import VlmPipeline - - model_map = { - "granitedocling": vlm_model_specs.GRANITEDOCLING_TRANSFORMERS, - "smoldocling": vlm_model_specs.SMOLDOCLING_TRANSFORMERS, - "granitedocling-vllm": vlm_model_specs.GRANITEDOCLING_VLLM, - "granitedocling-mlx": vlm_model_specs.GRANITEDOCLING_MLX, - } - vlm_opts = model_map[args.vlm_model] - - pipeline_options = VlmPipelineOptions( - 
vlm_options=vlm_opts, - generate_page_images=True, - force_backend_text=args.force_backend_text, - ) - - return DocumentConverter( - format_options={ - InputFormat.PDF: PdfFormatOption( - pipeline_cls=VlmPipeline, - pipeline_options=pipeline_options, - ) - } - ) - - -def build_vlm_api_converter(args): - from docling.datamodel.base_models import InputFormat - from docling.datamodel.pipeline_options import VlmPipelineOptions - from docling.datamodel.pipeline_options_vlm_model import ( - ApiVlmOptions, - ResponseFormat, - ) - from docling.document_converter import DocumentConverter, PdfFormatOption - from docling.pipeline.vlm_pipeline import VlmPipeline - - headers = {} - if args.vlm_api_key: - headers["Authorization"] = f"Bearer {args.vlm_api_key}" - - vlm_opts = ApiVlmOptions( - url=args.vlm_api_url, - params=dict( - model=args.vlm_api_model, - max_tokens=4096, - ), - headers=headers if headers else None, - prompt="Convert this page to docling.", - response_format=ResponseFormat.DOCTAGS, - timeout=120, - scale=2.0, - ) - - pipeline_options = VlmPipelineOptions( - vlm_options=vlm_opts, - generate_page_images=True, - force_backend_text=args.force_backend_text, - enable_remote_services=True, - ) - - return DocumentConverter( - format_options={ - InputFormat.PDF: PdfFormatOption( - pipeline_cls=VlmPipeline, - pipeline_options=pipeline_options, - ) - } - ) - - -def build_tokenizer(hf_model_id: str, openai_model, max_tokens: int): - if openai_model: - try: - import tiktoken - from docling_core.transforms.chunker.tokenizer.openai import OpenAITokenizer - except ImportError: - print( - "ERROR: OpenAI tokenizer requires:\n" - " pip install 'docling-core[chunking-openai]'", - file=sys.stderr, - ) - sys.exit(1) - return OpenAITokenizer( - tokenizer=tiktoken.encoding_for_model(openai_model), - max_tokens=max_tokens, - ) - from docling_core.transforms.chunker.tokenizer.huggingface import ( - HuggingFaceTokenizer, - ) - - return HuggingFaceTokenizer.from_pretrained( - 
model_name=hf_model_id, - max_tokens=max_tokens, - ) - - -def output_markdown(doc) -> str: - return doc.export_to_markdown() - - -def output_json(doc) -> str: - return json.dumps(doc.export_to_dict(), indent=2, ensure_ascii=False) - - -def output_chunks(doc, tokenizer) -> str: - from docling.chunking import HybridChunker - - chunker = HybridChunker(tokenizer=tokenizer, merge_peers=True) - chunks = list(chunker.chunk(doc)) - texts = [chunker.contextualize(c) for c in chunks] - - try: - counts = [tokenizer.count_tokens(t) for t in texts] - stats = ( - f"chunks={len(chunks)} " - f"min={min(counts)} max={max(counts)} " - f"avg={sum(counts) // len(counts)}" - ) - except Exception: - stats = f"chunks={len(chunks)}" - - lines = [f"# Chunks ({stats})", ""] - for i, (chunk, text) in enumerate(zip(chunks, texts)): - headings = ( - " > ".join(chunk.meta.headings) if chunk.meta.headings else "(no heading)" - ) - lines += [f"## Chunk {i + 1} | {headings}", "", text, "", "---", ""] - return "\n".join(lines) - - -def page_count(doc) -> int: - pages = set() - for item, _ in doc.iterate_items(): - for prov in getattr(item, "prov", []): - pages.add(prov.page_no) - return len(pages) - - -def main(): - args = parse_args() - check_dependencies() - - tokenizer = None - if args.format == "chunks": - tokenizer = build_tokenizer(args.tokenizer, args.openai_model, args.max_tokens) - - if args.pipeline == "standard": - converter = build_standard_converter(args) - print( - f"Pipeline: standard (ocr={not args.no_ocr}, engine={args.ocr_engine})", - file=sys.stderr, - ) - elif args.pipeline == "vlm-local": - converter = build_vlm_local_converter(args) - print( - f"Pipeline: vlm-local (model={args.vlm_model}, " - f"force_backend_text={args.force_backend_text})", - file=sys.stderr, - ) - elif args.pipeline == "vlm-api": - converter = build_vlm_api_converter(args) - print( - f"Pipeline: vlm-api (url={args.vlm_api_url}, model={args.vlm_api_model})", - file=sys.stderr, - ) - - print(f"Converting: 
{args.source}", file=sys.stderr) - result = converter.convert(args.source) - doc = result.document - print(f"Pages processed: {page_count(doc)}", file=sys.stderr) - - if args.format == "markdown": - output = output_markdown(doc) - elif args.format == "json": - output = output_json(doc) - else: - output = output_chunks(doc, tokenizer) - - if args.out: - Path(args.out).write_text(output, encoding="utf-8") - print(f"Written to {args.out}", file=sys.stderr) - else: - print(output) - - -if __name__ == "__main__": - main() diff --git a/docs/examples/agent_skill/docling-document-intelligence/scripts/docling-evaluate.py b/docs/examples/agent_skill/docling-document-intelligence/scripts/docling-evaluate.py index 534b9eff81..71aef1c835 100644 --- a/docs/examples/agent_skill/docling-document-intelligence/scripts/docling-evaluate.py +++ b/docs/examples/agent_skill/docling-document-intelligence/scripts/docling-evaluate.py @@ -3,10 +3,11 @@ """ Evaluate a Docling JSON export and suggest pipeline / option changes. -Typical flow (agent or human), from bundle root: +Typical flow (agent or human): - python3 scripts/docling-convert.py input.pdf --format json --out doc.json - python3 scripts/docling-evaluate.py doc.json [--markdown out.md] + docling input.pdf --to json --output /tmp/ + docling input.pdf --to md --output /tmp/ + python3 scripts/docling-evaluate.py /tmp/input.json --markdown /tmp/input.md Exit codes: 0 = pass; 1 = fail or --fail-on-warn with status warn """ @@ -142,7 +143,7 @@ def evaluate( if m.get("heuristic_only"): issues.append("Could not load full DoclingDocument; metrics are partial.") actions.append( - "Ensure docling-core matches export; re-export with scripts/docling-convert.py --format json" + "Ensure docling-core matches export; re-export with: docling --to json --output " ) cpp = m.get("chars_per_page") or 0 @@ -151,42 +152,42 @@ def evaluate( f"Low text density ({cpp} chars/page); likely scan, image-heavy PDF, or extraction gap." 
) actions.append( - "Retry: standard pipeline with --ocr-engine tesseract, rapidocr, or mac" + "Retry: docling --ocr-engine tesserocr (or rapidocr, ocrmac)" ) - actions.append("Retry: --pipeline vlm-local (or vlm-api if GPU/API available)") + actions.append("Retry: docling --pipeline vlm") if m.get("replacement_chars", 0) > 5: issues.append( "Unicode replacement characters detected; OCR may be garbling text." ) - actions.append("Retry: --ocr-engine tesseract or rapidocr") + actions.append("Retry: docling --ocr-engine tesserocr (or rapidocr)") actions.append( - "Retry: --pipeline vlm-local --force-backend-text for hybrid text+VLM" + "Retry: docling --pipeline vlm (use force_backend_text=True via Python API for hybrid)" ) if m.get("duplicate_heavy") or (m.get("most_repeated_text_count", 0) > 8): issues.append( "Repeated text blocks; possible layout/OCR loop or bad reading order." ) - actions.append("Retry: --pipeline vlm-local for complex layout") - actions.append("If using VLM: try --force-backend-text for text-heavy pages") + actions.append("Retry: docling --pipeline vlm") + actions.append("If using VLM: try force_backend_text=True via Python API for text-heavy pages") if expect_tables and m.get("tables", 0) == 0: issues.append("No tables detected but tables were expected.") - actions.append("Retry: standard pipeline without --no-tables") - actions.append("Retry: --pipeline vlm-local for merged-cell or visual tables") + actions.append("Retry: docling (tables are enabled by default; remove --no-tables if set)") + actions.append("Retry: docling --pipeline vlm (better for merged-cell or visual tables)") mc = m.get("markdown_chars", 0) if mc > 0 and mc < min_markdown_chars and m.get("page_count", 0) >= 1: issues.append(f"Markdown export is very short ({mc} chars) for the page count.") - actions.append("Retry: OCR/VLM pipelines as above") + actions.append("Retry: docling --pipeline vlm (or try different --ocr-engine)") if m.get("text_items", 0) == 0 and 
m.get("page_count", 0) == 0: issues.append( "No text items and no page provenance; export may be empty or invalid." ) actions.append( - "Verify source file opens correctly; retry with explicit --pipeline standard" + "Verify source file opens correctly; retry with: docling --pipeline standard" ) seen = set() @@ -258,7 +259,7 @@ def main() -> None: "issues": issues, "recommended_actions": actions, "next_steps_for_agent": [ - "Re-run scripts/docling-convert.py with flags from recommended_actions.", + "Re-run docling with flags from recommended_actions.", "Re-export JSON and run this script again until status is pass.", "Append a row to improvement-log.md (see SKILL.md).", ], diff --git a/docs/examples/agent_skill/docling-document-intelligence/scripts/requirements.txt b/docs/examples/agent_skill/docling-document-intelligence/scripts/requirements.txt index b4272960a5..21a5e56c5f 100644 --- a/docs/examples/agent_skill/docling-document-intelligence/scripts/requirements.txt +++ b/docs/examples/agent_skill/docling-document-intelligence/scripts/requirements.txt @@ -1,4 +1,3 @@ -# From bundle root: pip install -r scripts/requirements.txt +# pip install -r scripts/requirements.txt docling>=2.81.0 docling-core>=2.67.1 -packaging>=23.0 From def14165581972ef10c2a08af6a9bf067203496a Mon Sep 17 00:00:00 2001 From: jehlum11 Date: Thu, 2 Apr 2026 08:34:54 -0400 Subject: [PATCH 5/5] docs: fix ruff format on docling-evaluate.py Signed-off-by: jehlum11 Made-with: Cursor --- .../scripts/docling-evaluate.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/docs/examples/agent_skill/docling-document-intelligence/scripts/docling-evaluate.py b/docs/examples/agent_skill/docling-document-intelligence/scripts/docling-evaluate.py index 71aef1c835..a6d9d11d0a 100644 --- a/docs/examples/agent_skill/docling-document-intelligence/scripts/docling-evaluate.py +++ b/docs/examples/agent_skill/docling-document-intelligence/scripts/docling-evaluate.py @@ -170,17 +170,25 
@@ def evaluate( "Repeated text blocks; possible layout/OCR loop or bad reading order." ) actions.append("Retry: docling --pipeline vlm") - actions.append("If using VLM: try force_backend_text=True via Python API for text-heavy pages") + actions.append( + "If using VLM: try force_backend_text=True via Python API for text-heavy pages" + ) if expect_tables and m.get("tables", 0) == 0: issues.append("No tables detected but tables were expected.") - actions.append("Retry: docling (tables are enabled by default; remove --no-tables if set)") - actions.append("Retry: docling --pipeline vlm (better for merged-cell or visual tables)") + actions.append( + "Retry: docling (tables are enabled by default; remove --no-tables if set)" + ) + actions.append( + "Retry: docling --pipeline vlm (better for merged-cell or visual tables)" + ) mc = m.get("markdown_chars", 0) if mc > 0 and mc < min_markdown_chars and m.get("page_count", 0) >= 1: issues.append(f"Markdown export is very short ({mc} chars) for the page count.") - actions.append("Retry: docling --pipeline vlm (or try different --ocr-engine)") + actions.append( + "Retry: docling --pipeline vlm (or try different --ocr-engine)" + ) if m.get("text_items", 0) == 0 and m.get("page_count", 0) == 0: issues.append(