agent-sdk-docs-generate #19
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # This workflow generates airbyte-agent-sdk API documentation using pdoc3 | |
| # and commits the generated markdown files to the docs/ai-agents/reference/sdk | |
| # directory. | |
| # | |
| # The SDK lives inside the private airbytehq/sonar monorepo under | |
| # connector-sdk/. This workflow always builds docs from the tip of sonar's | |
| # default branch (main) — there is no versioned release stream for the SDK | |
| # today and we intentionally track main as the source of truth. | |
| # | |
| # The workflow can be triggered: | |
| # - Manually via workflow_dispatch | |
| # - On a schedule (daily) | |
| # | |
| # pdoc3 generates markdown output when --output-dir is specified without | |
| # --html or --pdf flags. See: https://github.com/pdoc3/pdoc/issues/257 | |
| # | |
| # The MDX sanitization logic mirrors pyairbyte-docs-generate.yml so both | |
| # pipelines produce output that Docusaurus/MDX can compile cleanly. | |
name: agent-sdk-docs-generate

on:
  # Allow on-demand runs from the Actions tab.
  workflow_dispatch:
  # Nightly run at midnight UTC, so the published reference is never more
  # than a day behind sonar's main branch.
  schedule:
    - cron: '0 0 * * *'
jobs:
  generate-docs:
    name: agent-sdk-docs-generate
    runs-on: ubuntu-24.04
    # Scopes granted to the workflow's GITHUB_TOKEN for this job.
    permissions:
      contents: write
      pull-requests: write
    steps:
# Shallow checkout of this repository; the generated pages are written into
# its docs/ tree by the later steps.
- name: Checkout Airbyte Repository
  uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
  with:
    token: ${{ secrets.GITHUB_TOKEN }}
    fetch-depth: 1
# Mint a GitHub App installation token scoped to the airbyte and sonar
# repositories. Later steps read it as steps.get-app-token.outputs.token.
- name: Authenticate as GitHub App (sonar read + airbyte write)
  id: get-app-token
  uses: actions/create-github-app-token@f8d387b68d61c58ab83c6c016672934102569859 # v3.0.0
  with:
    app-id: ${{ secrets.OCTAVIA_BOT_APP_ID }}
    private-key: ${{ secrets.OCTAVIA_BOT_PRIVATE_KEY }}
    owner: 'airbytehq'
    repositories: 'airbyte,sonar'
- name: Set up Python
  uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6.2.0
  with:
    # Quoted so YAML keeps the version as a string rather than a float.
    python-version: '3.13'
# uv is used by the "Generate Typed Connector Modules" step (`uv sync`).
- name: Set up uv
  uses: astral-sh/setup-uv@f06b870e0a91d23284a3013acc55e6f88ab4b904 # v7
# Shallow-clone the private sonar repository into ./sonar-source and expose
# the short commit SHA as the `sonar_sha` step output.
- name: Clone sonar Repository at main
  id: clone
  env:
    # Hand the token to the shell through the environment instead of
    # interpolating the expression into the script text.
    SONAR_TOKEN: ${{ steps.get-app-token.outputs.token }}
  run: |
    # Track main as the source of truth — no version pin.
    git clone --depth 1 --branch main \
      "https://x-access-token:${SONAR_TOKEN}@github.com/airbytehq/sonar.git" \
      sonar-source
    SHORT_SHA=$(git -C sonar-source rev-parse --short HEAD)
    echo "sonar_sha=${SHORT_SHA}" >> "$GITHUB_OUTPUT"
    echo "Cloned sonar@${SHORT_SHA}"
- name: Generate Typed Connector Modules
  working-directory: sonar-source
  run: |
    # Same preparation as connector-sdk-pdoc-publish.yml: the typed
    # connector modules have to exist inside airbyte_agent_sdk/ before
    # pdoc3 runs, otherwise the per-connector typed APIs would be missing
    # from the reference.
    uv sync --project connector-sdk --extra dev --dev
    ./scripts/connectors/generate-sdk.sh
- name: Install airbyte-agent-sdk and pdoc3
  run: |
    # Editable install of the SDK from the cloned sonar tree, so pdoc3 can
    # import airbyte_agent_sdk.
    pip install --editable sonar-source/connector-sdk
    # The documentation generator itself.
    pip install pdoc3
- name: Generate API Documentation
  run: |
    out_dir="docs/ai-agents/reference/sdk"
    # Create the reference directory if it doesn't exist.
    mkdir -p "$out_dir"
    # Drop pages committed by earlier runs before regenerating. pdoc3 only
    # writes pages for modules that exist now, so a page whose module was
    # removed or renamed upstream would otherwise linger in the checkout and
    # be post-processed again by the following steps (a second frontmatter
    # block, duplicate anchors) on every run. The hand-written readme.md is
    # kept.
    find "$out_dir" -type f -name "*.md" ! -iname "readme.md" -delete
    # Generate markdown documentation using pdoc3.
    # Without --html or --pdf flags, pdoc3 outputs Markdown-Extra format.
    # See: https://github.com/pdoc3/pdoc/issues/257
    pdoc3 --force --output-dir "$out_dir" \
      --config show_source_code=False \
      --config sort_identifiers=True \
      --config show_type_annotations=True \
      airbyte_agent_sdk
    echo "Generated documentation files:"
    find "$out_dir" -name "*.md" | head -20
- name: Add Docusaurus Frontmatter
  run: |
    # Prepend a Docusaurus frontmatter block to every generated page.
    # Both `id` and `title` are derived from the page's path below the
    # reference root ("/" becomes "-" for the id and "." for the title), so
    # they are unique across the tree and do not collide as translation keys.
    # See: https://github.com/facebook/docusaurus/discussions/11458
    # The hand-written landing readme is left alone.
    base_dir="docs/ai-agents/reference/sdk"
    # NUL-delimited find/read keeps paths with whitespace or glob
    # characters intact.
    while IFS= read -r -d '' file; do
      module_path="${file#"$base_dir"/}"
      module_path="${module_path%.md}"
      module_title="${module_path//\//.}"
      unique_id="${module_path//\//-}"
      {
        printf '%s\n' "---"
        printf 'id: %s\n' "$unique_id"
        printf 'title: %s\n' "$module_title"
        printf '%s\n' "---"
        printf '\n'
        cat "$file"
      } > "${file}.tmp"
      mv "${file}.tmp" "$file"
    done < <(find "$base_dir" -name "*.md" ! -iname "readme.md" -print0)
    echo "Added Docusaurus frontmatter to all generated files (excluding readme.md)"
| - name: Resolve pdoc3 Cross-References | |
| run: | | |
| # pdoc3 generates same-page anchor links (#SymbolName) that assume | |
| # every symbol lives on a single HTML page. In Docusaurus each | |
| # module is its own page, so these anchors are broken. | |
| # | |
| # This step: | |
| # 1. Adds <a id="SymbolName"></a> anchors before each | |
| # function/class definition so same-page links work. | |
| # 2. Rewrites cross-page #anchor links to relative file paths | |
| # (e.g. index.md#connect) so inter-module references work. | |
| python3 << 'PYEOF' | |
import re
from collections import defaultdict
from os.path import relpath
from pathlib import Path

# Root of the generated reference tree; page paths are taken relative to it.
base_dir = Path("docs/ai-agents/reference/sdk")

# A symbol definition: a line starting with a backtick, an identifier and "(".
SYMBOL_DEF_RE = re.compile(r"^`(\w+)\(", re.MULTILINE)
# A Markdown link whose target is a bare anchor: [text](#Name).
ANCHOR_LINK_RE = re.compile(r"\[([^\]]+)\]\(#(\w+)\)")


def generated_pages(root: Path) -> list[Path]:
    """Return every *.md page below ``root`` except readme.md (any case)."""
    return [p for p in root.rglob("*.md") if p.name.lower() != "readme.md"]


def add_anchors(content: str) -> str:
    """Insert ``<a id="Name"></a>`` and a blank line before each symbol definition."""
    return SYMBOL_DEF_RE.sub(
        lambda m: f'<a id="{m.group(1)}"></a>\n\n`{m.group(1)}(',
        content,
    )


def pick_target(candidates, current_dir: str) -> str:
    """Choose the page a cross-page link should point to.

    Preference order: an index.md in ``current_dir``, then the
    alphabetically first index.md anywhere, then the alphabetically first
    candidate.

    Args:
        candidates: non-empty iterable of page paths relative to the root.
        current_dir: directory of the linking page, relative to the root
            ("." for pages at the root).
    """
    ordered = sorted(candidates)
    index_pages = [c for c in ordered if Path(c).name == "index.md"]
    for page in index_pages:
        if str(Path(page).parent) == current_dir:
            return page
    if index_pages:
        return index_pages[0]
    return ordered[0]


def resolve_links(
    content: str,
    rel: str,
    symbol_files: dict[str, set[str]],
    file_symbols: dict[str, set[str]],
) -> str:
    """Rewrite ``[text](#anchor)`` links found in one page.

    Args:
        content: Markdown text of the page.
        rel: path of the page relative to the reference root.
        symbol_files: symbol name -> pages that define it.
        file_symbols: page -> symbol names defined on it.

    Returns:
        The text with each anchor link either kept (symbol defined on this
        page), replaced by its display text (symbol defined nowhere), or
        pointed at ``<relative page>#anchor`` (symbol defined elsewhere).
    """
    current_dir = str(Path(rel).parent)
    local_symbols = file_symbols.get(rel, ())

    def _resolve(m: re.Match) -> str:
        text, anchor = m.group(1), m.group(2)
        if anchor in local_symbols:
            # Same-page symbol — keep the anchor link as-is.
            return m.group(0)
        candidates = symbol_files.get(anchor)
        if not candidates:
            # Unknown symbol — drop the link, keep the display text.
            return text
        target_rel = relpath(pick_target(candidates, current_dir), current_dir)
        return f"[{text}]({target_rel}#{anchor})"

    return ANCHOR_LINK_RE.sub(_resolve, content)


def main() -> None:
    """Index every symbol, then add anchors and fix links page by page."""
    # ── Build symbol index ──────────────────────────────────────
    symbol_files: dict[str, set[str]] = defaultdict(set)
    file_symbols: dict[str, set[str]] = defaultdict(set)
    for md_path in generated_pages(base_dir):
        rel = str(md_path.relative_to(base_dir))
        for m in SYMBOL_DEF_RE.finditer(md_path.read_text(encoding="utf-8")):
            symbol_files[m.group(1)].add(rel)
            file_symbols[rel].add(m.group(1))
    print(f"Indexed {len(symbol_files)} symbols across {len(file_symbols)} files")

    # ── Add anchors and resolve links ───────────────────────────
    for md_path in generated_pages(base_dir):
        rel = str(md_path.relative_to(base_dir))
        original = md_path.read_text(encoding="utf-8")
        content = resolve_links(
            add_anchors(original), rel, symbol_files, file_symbols
        )
        # Only touch pages whose text actually changed.
        if content != original:
            md_path.write_text(content, encoding="utf-8")
            print(f"Resolved: {md_path}")
    print("Cross-reference resolution complete")


# The heredoc is executed as the main module, so this runs in the workflow
# but not when the helpers are imported.
if __name__ == "__main__":
    main()
| PYEOF | |
| - name: Sanitize for MDX Compatibility | |
| run: | | |
| # Docusaurus uses MDX which interprets { and } as JSX expressions, | |
| # and <...> patterns as JSX tags. We escape these in the generated | |
| # markdown to prevent MDX compilation errors. Logic mirrors | |
| # pyairbyte-docs-generate.yml so both pipelines stay in sync. | |
| python3 << 'EOF' | |
| import os | |
| import re | |
# HTML tags that should NOT be escaped (legitimate HTML in Markdown)
ALLOWED_HTML_TAGS = {
    'a', 'abbr', 'address', 'article', 'aside', 'audio', 'b', 'bdi', 'bdo',
    'blockquote', 'br', 'button', 'canvas', 'caption', 'cite', 'code', 'col',
    'colgroup', 'data', 'datalist', 'dd', 'del', 'details', 'dfn', 'dialog',
    'div', 'dl', 'dt', 'em', 'embed', 'fieldset', 'figcaption', 'figure',
    'footer', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hr',
    'i', 'iframe', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li',
    'main', 'map', 'mark', 'menu', 'meter', 'nav', 'noscript', 'object',
    'ol', 'optgroup', 'option', 'output', 'p', 'picture', 'pre', 'progress',
    'q', 'rp', 'rt', 'ruby', 's', 'samp', 'section', 'select', 'small',
    'source', 'span', 'strong', 'sub', 'summary', 'sup', 'table', 'tbody',
    'td', 'template', 'textarea', 'tfoot', 'th', 'thead', 'time', 'tr',
    'track', 'u', 'ul', 'var', 'video', 'wbr',
}


def is_allowed_html_tag(match_text):
    """Check if a <...> pattern is a legitimate HTML tag.

    Case-sensitive: uppercase tokens like `<DATA>` in docstrings
    are almost always placeholders, not real HTML.

    Args:
        match_text: the full ``<...>`` text, including the angle brackets.

    Returns:
        True when the text opens (or closes) a tag whose lower-case name is
        in ALLOWED_HTML_TAGS, False otherwise.
    """
    tag_match = re.match(r'</?([a-z][a-z0-9]*)', match_text)
    if tag_match:
        return tag_match.group(1) in ALLOWED_HTML_TAGS
    return False


def escape_angle_brackets(text):
    """Escape angle brackets that look like JSX but aren't valid HTML tags.

    Args:
        text: a fragment of Markdown prose (not code).

    Returns:
        The fragment with every ``<...>`` span that is not an allowed HTML
        tag rewritten using ``&lt;`` / ``&gt;`` entities, and every other
        ``<`` that is not followed by a letter or ``/`` rewritten as
        ``&lt;``. Allowed tags pass through unchanged.
    """
    def replace_angles(match):
        content = match.group(0)
        if is_allowed_html_tag(content):
            return content
        # The replacement must be the HTML entities; replacing '<' with a
        # literal '<' would leave the text unescaped.
        return content.replace('<', '&lt;').replace('>', '&gt;')

    pattern = r'<[^>]+>'
    out = re.sub(pattern, replace_angles, text)
    # MDX treats '<' as a tag opener only when followed by a letter
    # or '/'. Anything else (e.g. `field <= value`) needs escaping.
    out = re.sub(r'<(?![a-zA-Z/])', '&lt;', out)
    return out
def sanitize_for_mdx(content):
    """Make a Markdown page safe for MDX without touching frontmatter or code.

    Lines are handled one at a time:
      * ``---`` lines are copied through; everything between the first and
        the second one is treated as frontmatter and copied verbatim.
      * Lines starting with a code-fence marker (three backticks or three
        tildes) toggle the code-block state and are copied verbatim, as are
        the lines inside a code block.
      * On every other line, backtick-delimited inline code is kept as is;
        the remaining text gets unescaped braces backslash-escaped and is
        passed through escape_angle_brackets().

    Args:
        content: full text of the page.

    Returns:
        The sanitized text, joined with the same newline separators.
    """
    def _escape_prose(fragment):
        # Backslash-escape braces that are not already escaped, then hand
        # the fragment to the angle-bracket escaper.
        fragment = re.sub(r'(?<!\\)\{', r'\\{', fragment)
        fragment = re.sub(r'(?<!\\)\}', r'\\}', fragment)
        return escape_angle_brackets(fragment)

    sanitized = []
    delimiters_seen = 0
    inside_frontmatter = False
    inside_fence = False

    for raw in content.split('\n'):
        stripped = raw.strip()
        if stripped == '---':
            delimiters_seen += 1
            # Frontmatter spans from the first delimiter to the second;
            # later '---' lines are plain content.
            inside_frontmatter = delimiters_seen == 1
            sanitized.append(raw)
        elif inside_frontmatter:
            sanitized.append(raw)
        elif stripped.startswith(('```', '~~~')):
            inside_fence = not inside_fence
            sanitized.append(raw)
        elif inside_fence:
            sanitized.append(raw)
        else:
            # The capturing group keeps the inline-code spans in the split
            # result so they can be re-joined untouched.
            sanitized.append(''.join(
                piece
                if piece.startswith('`') and piece.endswith('`')
                else _escape_prose(piece)
                for piece in re.split(r'(`[^`]+`)', raw)
            ))

    return '\n'.join(sanitized)
# Rewrite every generated page in place, leaving readme.md (any case) alone.
base_dir = 'docs/ai-agents/reference/sdk'
for folder, _subdirs, names in os.walk(base_dir):
    for name in names:
        if not name.endswith('.md') or name.lower() == 'readme.md':
            continue
        target = os.path.join(folder, name)
        with open(target, 'r', encoding='utf-8') as handle:
            raw = handle.read()
        # Sanitize before reopening for writing, so the file is only
        # truncated once the new content is ready.
        cleaned = sanitize_for_mdx(raw)
        with open(target, 'w', encoding='utf-8') as handle:
            handle.write(cleaned)
        print(f"Processed: {target}")
print("Sanitized all generated markdown files for MDX compatibility")
| EOF | |
- name: Cleanup sonar Source
  run: |
    # The clone carries its own .git directory; delete it so it cannot be
    # committed into this repository as a submodule.
    rm -rf sonar-source
    echo "Cleaned up temporary files"
- name: Check for Changes
  id: changes
  run: |
    # Stage everything under the reference directory and use the staged
    # diff as the signal: `git diff --cached --quiet` exits 0 when nothing
    # is staged. The result is published as the `has_changes` step output.
    git add docs/ai-agents/reference/sdk/
    has_changes=true
    if git diff --cached --quiet; then
      has_changes=false
    fi
    echo "has_changes=${has_changes}" >> "$GITHUB_OUTPUT"
# Open (or update) the docs PR, but only when the previous step reported
# staged changes. The GitHub App token minted earlier is used to do so.
- name: Create Pull Request
  if: steps.changes.outputs.has_changes == 'true'
  uses: peter-evans/create-pull-request@c0f553fe549906ede9cf27b5156039d195d2ece0 # v8.1.0
  with:
    token: ${{ steps.get-app-token.outputs.token }}
    commit-message: "docs: Update airbyte-agent-sdk API reference documentation (sonar@${{ steps.clone.outputs.sonar_sha }})"
    title: "docs: Update airbyte-agent-sdk API reference documentation (sonar@${{ steps.clone.outputs.sonar_sha }})"
    # Blank lines separate the paragraphs. The one before `---` matters
    # most: without it Markdown reads the preceding paragraph as a setext
    # heading instead of drawing a horizontal rule.
    body: |
      This PR updates the airbyte-agent-sdk API reference documentation.

      **Source:** airbytehq/sonar @ `${{ steps.clone.outputs.sonar_sha }}` (main)

      The documentation was auto-generated using [pdoc3](https://github.com/pdoc3/pdoc)
      from the `airbyte_agent_sdk` package. The SDK tracks sonar main as the
      source of truth — see `.github/workflows/agent-sdk-docs-generate.yml`.

      ---

      *This PR was automatically generated by the [agent-sdk-docs-generate](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) workflow.*
    branch: docs/agent-sdk-api-reference
    base: master
    labels: |
      area/documentation
      auto-generated
      auto-merge
    delete-branch: true