|
| 1 | +""" |
| 2 | +Preprocess hook for mkdocs-llmstxt. |
| 3 | +
|
| 4 | +Strips TypeDoc boilerplate from HTML before markdown conversion to shrink |
| 5 | +llms-full-content.txt below the WebFetch ingestion threshold. All |
| 6 | +transformations preserve unique documentation content; only repetitive |
| 7 | +scaffolding and intra-file link wrappers are removed or collapsed. |
| 8 | +
|
| 9 | +Targeted artifacts (verified against the CI-pinned build of main): |
| 10 | +
|
| 11 | + 1. <hr> — 103 occurrences; methods already |
| 12 | + separated by <h3> headings. |
| 13 | + 2. Nested "(): void" return blocks — 26 occurrences; TypeDoc boilerplate |
| 14 | + for `() => void` return types. |
| 15 | + 3. Parameters / Type Parameters tables — 119 total; collapsed to bullet |
| 16 | + lists. Preserves param name, type, |
| 17 | + and description. Works for any row |
| 18 | + count (single- and multi-row). |
| 19 | + 4. "Prerequisites: Initialize the SDK" — 16 occurrences; duplicates the |
| 20 | + Authentication docs and adds no |
| 21 | + new information per-method. |
| 22 | + 5. Intra-docs <a> cross-reference links — 600+ occurrences; `<a>` tags whose |
| 23 | + only child is a <code> element are |
| 24 | + type cross-references. The link |
| 25 | + text (the type name) is preserved; |
| 26 | + the href is dropped because every |
| 27 | + target is already in the same |
| 28 | + aggregated llms-full-content.txt. |
| 29 | +
|
| 30 | +Wired up in mkdocs.yml under the large llmstxt plugin block: |
| 31 | +
|
| 32 | + plugins: |
| 33 | + - llmstxt: |
| 34 | + full_output: "llms-full-content.txt" |
| 35 | + preprocess: "scripts/llms-preprocess.py" |
| 36 | + ... |
| 37 | +
|
| 38 | +Contract (from mkdocs_llmstxt._internal.preprocess): |
| 39 | + preprocess(soup: BeautifulSoup, output: str) -> None |
| 40 | + - soup has already passed through `autoclean` (permalinks, images stripped) |
| 41 | + - output is the per-page markdown output path (not the aggregated llms*.txt) |
| 42 | +""" |
| 43 | + |
| 44 | +from __future__ import annotations |
| 45 | + |
| 46 | +from typing import TYPE_CHECKING |
| 47 | + |
| 48 | +from bs4 import NavigableString |
| 49 | + |
| 50 | +if TYPE_CHECKING: |
| 51 | + from bs4 import BeautifulSoup, Tag |
| 52 | + |
| 53 | + |
| 54 | +_PREREQUISITES_PREFIX = "Prerequisites: Initialize the SDK first" |
| 55 | +_PARAM_HEADINGS = ("Parameters", "Type Parameters") |
| 56 | + |
| 57 | + |
def preprocess(soup: BeautifulSoup, _output: str) -> None:
    """Run every boilerplate-stripping pass over one page, in place.

    mkdocs-llmstxt calls this hook once per included page per llmstxt block.
    The same passes are applied to every page regardless of which output the
    page ends up in.

    Args:
        soup: Parsed page HTML; mutated in place.
        _output: Per-page markdown output path. Required by the plugin's
            calling convention but unused here, hence the underscore prefix.
    """
    # Order is kept exactly as listed: each pass sees the tree as left by the
    # previous one.
    passes = (
        _remove_hrs,
        _collapse_nested_void_returns,
        _collapse_param_tables,
        _strip_prerequisites_paragraphs,
        _unwrap_type_cross_reference_links,
    )
    for apply_pass in passes:
        apply_pass(soup)
| 75 | + |
| 76 | + |
| 77 | +def _remove_hrs(soup: BeautifulSoup) -> None: |
| 78 | + """Strip <hr> elements — TypeDoc places them between methods that already |
| 79 | + have <h3> headings, so the horizontal rule is pure visual redundancy.""" |
| 80 | + for hr in soup.find_all("hr"): |
| 81 | + hr.decompose() |
| 82 | + |
| 83 | + |
def _collapse_nested_void_returns(soup: BeautifulSoup) -> None:
    """Drop the nested "(): void" return block emitted for `() => void` types.

    Shape being matched (siblings, in order):
        <h4>Returns</h4>
        <p>Cleanup function to remove the handler</p>   <- kept
        <blockquote>(): void</blockquote>               <- removed
        <h5>Returns</h5>                                <- removed
        <p>void</p>                                     <- removed
        <h4>Example</h4>                                <- next section

    The inner <h5>Returns</h5> is the anchor: the block is removed only when
    its previous sibling element is a <blockquote> and its next sibling
    element is a <p> whose text is exactly "void".
    """
    for inner_heading in soup.find_all("h5"):
        if _heading_text(inner_heading) != "Returns":
            continue

        signature = inner_heading.find_previous_sibling()
        if signature is None or signature.name != "blockquote":
            continue

        void_para = inner_heading.find_next_sibling()
        if void_para is None or void_para.name != "p":
            continue
        if void_para.get_text(strip=True) != "void":
            continue

        # Remove the two siblings first, then the anchor heading itself.
        for node in (signature, void_para, inner_heading):
            node.decompose()
| 114 | + |
| 115 | + |
def _collapse_param_tables(soup: BeautifulSoup) -> None:
    """Swap each Parameters / Type Parameters table for a bullet list.

    Applies to tables of any row count. A table costs a header row, a
    separator row and pipe-delimited data rows, whereas the list costs one
    `<li>` per row.

    Rendered shape (Format B):
        - `options?`: `T` — Query options including folderId…
        - `folderId`: `number` — The ID of the organization unit
        - `cursor?`: `string` — Pagination cursor from a previous response
    """
    for h4 in soup.find_all("h4"):
        param_table = _find_param_table(h4)
        if param_table is None:
            continue

        bullets = _build_param_list(soup, param_table, _heading_text(h4))

        # An empty list means no row could be rendered; keep the original
        # table so malformed input never silently disappears.
        if bullets.find("li") is None:
            continue
        param_table.replace_with(bullets)
| 136 | + |
| 137 | + |
def _find_param_table(heading: Tag) -> Tag | None:
    """Locate the parameter table introduced by `heading`, if there is one.

    Returns the next sibling element when the heading text is one of
    `_PARAM_HEADINGS` and that sibling is a <table> containing a <tbody>.
    Returns None in every other case.
    """
    if _heading_text(heading) in _PARAM_HEADINGS:
        candidate = heading.find_next_sibling()
        if (
            candidate is not None
            and candidate.name == "table"
            and candidate.find("tbody") is not None
        ):
            return candidate
    return None
| 147 | + |
| 148 | + |
def _build_param_list(soup: BeautifulSoup, table: Tag, kind: str) -> Tag:
    """Create a <ul> holding one <li> for each usable row of `table`'s <tbody>.

    Only direct <tr> children of the <tbody> and direct <td>/<th> children of
    each row are considered. Rows with fewer than two cells are skipped
    (defensive; valid TypeDoc output should not contain them).

    Args:
        soup: Used as the factory for new tags.
        table: Table expected to contain a <tbody>.
        kind: Heading text, forwarded to `_render_param_row`.
    """
    bullets = soup.new_tag("ul")
    body = table.find("tbody")
    for tr in body.find_all("tr", recursive=False):
        columns = tr.find_all(["td", "th"], recursive=False)
        if len(columns) < 2:
            continue
        bullets.append(_render_param_row(soup, columns, kind=kind))
    return bullets
| 160 | + |
| 161 | + |
def _render_param_row(
    soup: BeautifulSoup,
    cells: list[Tag],
    kind: str,
) -> Tag:
    """Turn one table row into a single `<li>` element.

    Parameters rows (Format B):
        <li>{name}: {type} — {description}</li>
        -> md: - `options?`: `T` — Query options including folderId…

    Type Parameters rows (two columns: type param, default type):
        <li>{type param} = {default}</li>

    Args:
        soup: Used as the factory for the new <li>.
        cells: The row's cells; at least two are expected.
        kind: "Type Parameters" selects the type-parameter layout; any other
            value selects the Parameters layout.
    """
    item = soup.new_tag("li")

    # First column is the parameter (or type parameter) name. Its child nodes
    # are moved, not copied as text, so inner <code>/<a> markup survives.
    # After this call cells[0] has no children left.
    name_cell = cells[0]
    _move_children(item, name_cell)

    append_rest = (
        _append_type_parameter_default
        if kind == "Type Parameters"
        else _append_parameter_type_and_description
    )
    append_rest(item, cells)
    return item
| 186 | + |
| 187 | + |
def _append_type_parameter_default(li: Tag, cells: list[Tag]) -> None:
    """Append ` = default` to `li` for a Type Parameters row.

    Nothing is appended when the default cell is empty, holds only a dash
    placeholder, or repeats the type parameter's own name.

    Args:
        li: List item that already holds the type parameter name.
        cells: The row's cells; `cells[0]` is the type parameter and
            `cells[1]` the default type.
    """
    # ASCII hyphen plus Unicode hyphen / en dash / em dash look-alikes that a
    # table generator may use to mark an empty cell.
    placeholders = ("-", "\u2010", "\u2013", "\u2014")

    default_text = cells[1].get_text(strip=True)

    # `_render_param_row` moves cells[0]'s children into `li` before calling
    # this helper, which leaves cells[0] empty. Reading only cells[0] would
    # always give "" and the "same as the type param" check could never
    # match, so fall back to the text already sitting in `li`.
    param_text = cells[0].get_text(strip=True) or li.get_text(strip=True)

    if not default_text:
        return
    if default_text in placeholders or default_text == param_text:
        return

    li.append(NavigableString(" = "))
    _move_children(li, cells[1])
| 196 | + |
| 197 | + |
def _append_parameter_type_and_description(li: Tag, cells: list[Tag]) -> None:
    """Append `: type` and, when present, ` — description` for a Parameters row.

    The description is skipped when the row has no third cell, when that cell
    is empty, or when it holds only a dash placeholder. A lone dash carries
    no information, and `_append_type_parameter_default` treats it the same
    way.

    Args:
        li: List item that already holds the parameter name.
        cells: The row's cells; `cells[1]` is the type and `cells[2]`, if it
            exists, the description.
    """
    # ASCII hyphen plus Unicode hyphen / en dash / em dash look-alikes that a
    # table generator may use to mark an empty cell.
    placeholders = ("-", "\u2010", "\u2013", "\u2014")

    li.append(NavigableString(": "))
    _move_children(li, cells[1])

    if len(cells) < 3:
        return
    description = cells[2].get_text(strip=True)
    if description and description not in placeholders:
        li.append(NavigableString(" — "))
        _move_children(li, cells[2])
| 205 | + |
| 206 | + |
| 207 | +def _move_children(destination: Tag, source: Tag) -> None: |
| 208 | + """Move all child nodes from `source` into `destination`, preserving order. |
| 209 | +
|
| 210 | + BeautifulSoup's `.append()` re-parents a node (removes it from the source's |
| 211 | + `.contents`). Popping from `source.contents[0]` in a while loop keeps the |
| 212 | + iteration correct despite that mutation — we repeatedly grab the current |
| 213 | + first child until there are none left. |
| 214 | + """ |
| 215 | + while source.contents: |
| 216 | + destination.append(source.contents[0]) |
| 217 | + |
| 218 | + |
def _strip_prerequisites_paragraphs(soup: BeautifulSoup) -> None:
    """Remove the repeated "Prerequisites: Initialize the SDK first…" blurb.

    The authentication and getting-started pages cover SDK initialization in
    depth; repeating the line on every service method adds ~120 bytes per
    occurrence with no new information.

    A <p> is removed when its text starts with `_PREREQUISITES_PREFIX` under
    either of two readings:
      * `get_text(strip=True)` — the original check, kept unchanged;
      * the full text with runs of whitespace collapsed to single spaces.
    The second reading is needed because `get_text(strip=True)` strips each
    text node on its own and joins them with no separator. Inline markup
    inside the prefix (for example a bold "Prerequisites:") or a line wrap in
    the source HTML would otherwise defeat the match.
    """
    for para in soup.find_all("p"):
        stripped = para.get_text(strip=True)
        normalized = " ".join(para.get_text().split())
        if stripped.startswith(_PREREQUISITES_PREFIX) or normalized.startswith(
            _PREREQUISITES_PREFIX
        ):
            para.decompose()
| 229 | + |
| 230 | + |
def _unwrap_type_cross_reference_links(soup: BeautifulSoup) -> None:
    """Replace code-only <a> links with their contents.

    TypeDoc wraps type cross-references in an <a>, e.g.
        <a href="../PaginatedResponse/"><code>PaginatedResponse</code></a>
    which converts to markdown as
        [`PaginatedResponse`](../PaginatedResponse/)
    and costs ~30 bytes per link. The link targets are part of the same
    aggregated llms-full-content.txt, so the URL is dropped and only the
    type name is kept.

    Safety: an <a> is unwrapped only when `_has_single_code_child` accepts
    it. Prose links such as
        <a href="...">Getting Started</a>
        <a href="...">UiPath Conversational Agents Guide</a>
    have a plain text node as content and are left alone.
    """
    for link in soup.find_all("a"):
        if not _has_single_code_child(link):
            continue
        link.unwrap()
| 252 | + |
| 253 | + |
def _has_single_code_child(tag: Tag) -> bool:
    """Tell whether `tag` contains exactly one <code> and nothing else.

    Whitespace-only text nodes are ignored. Any text node with visible
    characters, any element other than <code>, or a second <code> makes the
    result False. A tag with no <code> child at all is also False.
    """
    code_count = 0
    for node in tag.children:
        if isinstance(node, NavigableString):
            if node.strip():
                # Real text sits next to the code, so this is not code-only.
                return False
        elif node.name != "code":
            return False
        else:
            code_count += 1
            if code_count > 1:
                return False
    return code_count == 1
| 267 | + |
| 268 | + |
| 269 | +def _heading_text(element: Tag) -> str: |
| 270 | + """Get heading text without the trailing permalink marker. |
| 271 | +
|
| 272 | + `autoclean` strips the permalink <a>, but the ¶ character can linger |
| 273 | + on some themes — defensive normalization keeps this script resilient |
| 274 | + to mkdocs-material theme changes. |
| 275 | + """ |
| 276 | + return element.get_text().replace("¶", "").strip() |
0 commit comments