feat: add html export skill for confluence

Dorin POMIAN · Dorin POMIAN · commit 2e56a46c7322 · 2026-04-28T11:06:50.000+02:00
diff --git a/README.md b/README.md
@@ -76,7 +76,7 @@ uv run --directory src docs2vecs indexer --config ~/Downloads/sw_export_temp/con
 
 The config yaml file is validated against [this schema](./src/docs2vecs/subcommands/indexer/config/config_schema.yaml).
 
-Please check [sample config file 1](docs/readme/sample-config-file-1.yml), [sample config file 2](docs/readme/sample-config-file-2.yml) for your reference.
+Please check [sample config file 1](docs/readme/sample-config-file-1.yml), [sample config file 2](docs/readme/sample-config-file-2.yml), and [sample config file 3](docs/readme/sample-config-file-3.yml) for your reference.
 
 </details>
 
@@ -166,7 +166,7 @@ Please note that **api keys** should **NOT** be stored in config files, and shou
 
 Make sure you export the environment variables before you run the indexer. For convenience you can use the `--env` argument to supply your own `.env` file.
 
-Generate and use Scroll Word Exporter API tokens from the Personal Settings section of your Confluence profile.
+Generate and use Scroll Word Exporter API tokens from the Personal Settings section of your Confluence profile. For the Scroll HTML Exporter, generate a separate token from Personal Settings → Scroll HTML Exporter API Tokens. Note that each Scroll exporter (Word, HTML, PDF) requires its own token, and you must use the correct regional endpoint (US or EU/Germany).
 
 ## Experimental features
 <details><summary>Tracker</summary>
diff --git a/docs/readme/indexer-skills.md b/docs/readme/indexer-skills.md
@@ -20,27 +20,37 @@ This document describes all available skills that can be used in the indexer pip
    4. An `embedding` to generate embeddings from the chunks.
    5. A `vector-store` to store the embeddings.
 
-3. You have a list of jira tickets that you'd like to vectorize? You'll typically need the following type of skills in your config file:
+3. You have Confluence pages that you'd like to export as HTML, convert to Markdown, and then vectorize? You'll typically need:
+
+   1. A `scrollhtml-exporter` to export the Confluence pages as HTML (ZIP).
+   2. A `confluence-html-to-markdown` transformer to convert the HTML export into self-contained Markdown (with images).
+   3. A `file-scanner` to pick up the resulting `.md` files.
+   4. A `file-reader` to read the Markdown content.
+   5. A `splitter` to split the documents into chunks.
+   6. An `embedding` to generate embeddings from the chunks.
+   7. A `vector-store` to store the embeddings.
+
+4. You have a list of jira tickets that you'd like to vectorize? You'll typically need the following type of skills in your config file:
 
    1. A `jira-loader` to extract the data from the jira tickets
    2. A `splitter` to split the data into chunks.
    3. An `embedding` to generate embeddings from the chunks.
    4. A `vector-store` to store the embeddings.
 
-4. You have FAQ documents exported from Confluence (`.docx` files) and want to extract Q&A pairs for vectorization? You'll typically need:
+5. You have FAQ documents exported from Confluence (`.docx` files) and want to extract Q&A pairs for vectorization? You'll typically need:
 
    1. An `exporter` (Scroll Word) or `file-scanner` to get the `.docx` files.
    2. A `confluence-faq-splitter` to extract Q&A pairs directly from the `.docx` headings.
    3. An `embedding` to generate embeddings from the Q&A chunks.
    4. A `vector-store` to store the embeddings.
 
-5. You have enriched Q&A JSON output from a Teams FAQ pipeline and want to index it? You'll typically need:
+6. You have enriched Q&A JSON output from a Teams FAQ pipeline and want to index it? You'll typically need:
 
    1. A `teams-qna-loader` to load the enriched Q&A pairs from the JSON file.
    2. An `embedding` to generate embeddings from the Q&A content.
    3. A `vector-store` to store the embeddings.
 
-6. You want to avoid re-running expensive embedding and indexing when the content hasn't changed since the last run? Insert a `writer` (`json-writer`) skill as a change gate:
+7. You want to avoid re-running expensive embedding and indexing when the content hasn't changed since the last run? Insert a `writer` (`json-writer`) skill as a change gate:
 
    1. A `file-scanner` (or `exporter`) to locate/export your source documents.
    2. A `file-reader` to read their content.
@@ -81,6 +91,53 @@ Exports Confluence pages to Microsoft Word documents. Each entry in `page_urls`
           - id: 1234567890
             # no tag — falls back to top-level tag
 ```
+
+### Scroll HTML Exporter
+Exports Confluence pages as HTML via the K15t Scroll HTML Exporter REST API. The export is downloaded as a ZIP and extracted locally. This is typically followed by the `confluence-html-to-markdown` transformer skill.
+
+API tokens are exporter-specific — you need a **Scroll HTML Exporter** token (User Profile → Personal settings → Scroll HTML Exporter API Tokens).
+
+Use the regional endpoint that corresponds to your data residency:
+- US: `https://scroll-html.us.exporter.k15t.app/api/public/1/exports`
+- EU/Germany: `https://scroll-html.de.exporter.k15t.app/api/public/1/exports`
+
+```yaml
+- skill: &ScrollHTMLExporter
+    type: exporter
+    name: scrollhtml-exporter
+    params:
+        api_url: https://scroll-html.de.exporter.k15t.app/api/public/1/exports  # Use .us. for US region
+        auth_token: env.SCROLL_HTML_EXPORTER_TOKEN  # Scroll HTML Exporter API token
+        poll_interval: 2    # Interval in seconds to check the status of the export
+        export_folder: ~/Downloads/html_export  # Path where the exported ZIP is extracted
+        scope: current  # Possible values: [current | descendants | document]
+        template_id: com.k15t.scroll.html.helpcenter  # Optional: defaults to the bundled Help Center template
+        confluence_prefix: https://your-instance.atlassian.net/wiki  # Optional: used to build source_url
+        tag: my-docs  # Optional: default tag for all pages
+        page_ids:
+          - id: 1436680207
+            tag: copilot-docs  # Optional
+        page_urls:
+          - url: https://your-instance.atlassian.net/wiki/spaces/SPACE/pages/123/Page+Title
+```
+</details>
+
+<details><summary>Transformer Skills</summary>
+Transform data from one format to another on disk. Transformers sit between exporters and file-scanners in the pipeline.
+
+### Confluence HTML to Markdown
+Converts a Scroll HTML export folder into self-contained Markdown files. Images referenced in the pages are copied into an `images/` sub-folder so the output is portable without the original HTML.
+
+Typically used after `scrollhtml-exporter` and before `file-scanner`.
+
+```yaml
+- skill: &HtmlToMarkdown
+    type: transformer
+    name: confluence-html-to-markdown
+    params:
+        input_dir: ~/Downloads/html_export/1436680207  # Path to the extracted Scroll HTML export
+        output_dir: ~/Downloads/html_export/1436680207/markdown  # Optional: defaults to <input_dir>/markdown
+```
 </details>
 
 <details><summary>File Scanner Skills</summary>
diff --git a/docs/readme/sample-config-file-3.yml b/docs/readme/sample-config-file-3.yml
@@ -0,0 +1,63 @@
+definitions:
+  - skill: &ScrollHTMLExporter
+      type: exporter
+      name: scrollhtml-exporter
+      params:
+        api_url: https://scroll-html.de.exporter.k15t.app/api/public/1/exports
+        auth_token: env.SCROLL_HTML_EXPORTER_TOKEN
+        poll_interval: 2
+        export_folder: ~/Downloads/html_export
+        scope: current
+        confluence_prefix: https://amadeus.atlassian.net/wiki
+        page_ids:
+          - id: 1436680207
+
+  - skill: &HtmlToMarkdown
+      type: transformer
+      name: confluence-html-to-markdown
+      params:
+        input_dir: ~/Downloads/html_export/1436680207
+        output_dir: ~/Downloads/html_export/1436680207/markdown
+
+  - skill: &FileScanner
+      type: file-scanner
+      name: multi-file-scanner
+      params:
+        path: ~/Downloads/html_export/1436680207/markdown
+        filter: ["*.md"]
+        recursive: false
+
+  - skill: &FileReader
+      type: file-reader
+      name: multi-file-reader
+
+  - skill: &TextSplitter
+      type: splitter
+      name: recursive-character-splitter
+      params:
+        chunk_size: 1200
+        overlap: 200
+
+  - skill: &FastEmbed
+      type: embedding
+      name: llama-fastembed
+
+  - skill: &ChromaDbVectorStore
+      type: vector-store
+      name: chromadb
+      params:
+        db_path: ~/Downloads/html_export/chroma_db
+        collection_name: confluence-html-export
+
+  - skillset: &Pipeline
+      - *ScrollHTMLExporter
+      - *HtmlToMarkdown
+      - *FileScanner
+      - *FileReader
+      - *TextSplitter
+      - *FastEmbed
+      - *ChromaDbVectorStore
+
+indexer:
+  id: confluence-html-to-vectorstore
+  skillset: *Pipeline
diff --git a/pyproject.toml b/pyproject.toml
@@ -13,6 +13,7 @@ dependencies = [
     "azure-identity>=1.15.0",
     "azure-search-documents>=11.5.2",
     "azure-storage-blob>=12.24.1",
+    "beautifulsoup4>=4.12",
     "cerberus>=1.3.7",
     "chromadb>=0.6.3",
     "jira>=3.8.0",
@@ -26,6 +27,7 @@ dependencies = [
     "llama-index-retrievers-bm25>=0.5.2",
     "llama-index-vector-stores-chroma>=0.4.1",
     "markdown>=3.7",
+    "markdownify>=0.14",
     "openpyxl>=3.1.5",
     "pymongo>=4.11.1",
     "pystemmer>=2.2.0.3",
diff --git a/src/docs2vecs/subcommands/indexer/config/config_schema.yaml b/src/docs2vecs/subcommands/indexer/config/config_schema.yaml
@@ -96,7 +96,7 @@ definitions:
               scope:
                 type: string
                 required: False
-                allowed: ['descendants', 'current']
+                allowed: ['descendants', 'current', 'document']
               recursive:
                 type: boolean
               filter:
@@ -265,6 +265,9 @@ definitions:
                 type: integer
                 required: False
                 min: 1
+              template_id:
+                type: string
+                required: False
 
       skillset:
         type: list
diff --git a/src/docs2vecs/subcommands/indexer/skills/__init__.py b/src/docs2vecs/subcommands/indexer/skills/__init__.py
@@ -2,6 +2,7 @@
 from .azure_vector_store_skill import AzureVectorStoreSkill
 from .document_intelligence_skill import AzureDocumentIntelligenceSkill
 from .jira_loader_skill import JiraLoaderSkill
+from .scrollhtmlexporter_skill import ScrollHTMLExporterSkill
 from .scrollwordexporter_skill import ScrollWorldExporterSkill
 from .chromadb_vector_store_skill import ChromaDBVectorStoreSkill
 from .tracker import VectorStoreTracker
@@ -25,6 +26,7 @@
     "AzureDocumentIntelligenceSkill",
     "AzureVectorStoreSkill",
     "JiraLoaderSkill",
+    "ScrollHTMLExporterSkill",
     "ScrollWorldExporterSkill",
     "VectorStoreTracker",
     "ChromaDBVectorStoreSkill",
diff --git a/src/docs2vecs/subcommands/indexer/skills/factory.py b/src/docs2vecs/subcommands/indexer/skills/factory.py
@@ -12,6 +12,7 @@
 from docs2vecs.subcommands.indexer.skills import JiraLoaderSkill
 from docs2vecs.subcommands.indexer.skills import LlamaFastembedEmbeddingSkill
 from docs2vecs.subcommands.indexer.skills import RecursiveCharacterTextSplitter
+from docs2vecs.subcommands.indexer.skills import ScrollHTMLExporterSkill
 from docs2vecs.subcommands.indexer.skills import ScrollWorldExporterSkill
 from docs2vecs.subcommands.indexer.skills import SemanticSplitter
 from docs2vecs.subcommands.indexer.skills import VectorStoreTracker
@@ -39,6 +40,7 @@ class SkillType(StrEnum):
 class AvailableSkillName(StrEnum):
     # exporters
     SCROLLWORD_EXPORTER = "scrollword-exporter"
+    SCROLLHTML_EXPORTER = "scrollhtml-exporter"
 
     # file readers
     AZ_DOCUMENT_INTELLIGENCE = "azure-document-intelligence"
@@ -79,6 +81,7 @@ class AvailableSkillName(StrEnum):
 AVAILABLE_SKILLS = {
     SkillType.EXPORTER: {
         AvailableSkillName.SCROLLWORD_EXPORTER: ScrollWorldExporterSkill,
+        AvailableSkillName.SCROLLHTML_EXPORTER: ScrollHTMLExporterSkill,
     },
     SkillType.FILE_SCANNER: {AvailableSkillName.MULTI_FILE_SCANNER: FileScannerSkill},
     SkillType.FILE_READER: {
diff --git a/src/docs2vecs/subcommands/indexer/skills/scrollhtmlexporter_skill.py b/src/docs2vecs/subcommands/indexer/skills/scrollhtmlexporter_skill.py
diff --git a/uv.lock b/uv.lock