Skip to content

Commit 65a3f59

Browse files
crowecawcaw and claude committed
Redesign docs site with clean layout, sidebar TOC, and selector reference
Replaces the pandoc-generated dark theme with a standalone light theme using a two-column layout with sticky sidebar navigation. Adds usage section with example agent prompt and comprehensive selector documentation covering roles, attributes, combinators, and nth matching. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 245a7ad commit 65a3f59

3 files changed

Lines changed: 420 additions & 134 deletions

File tree

.github/workflows/docs.yml

Lines changed: 2 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -23,19 +23,11 @@ jobs:
2323
steps:
2424
- uses: actions/checkout@v4
2525

26-
- name: Install pandoc
27-
run: sudo apt-get install -y pandoc
28-
29-
- name: Build HTML from README
26+
- name: Build site
3027
run: |
3128
mkdir -p _site
32-
pandoc README.md \
33-
--standalone \
34-
--template docs/template.html \
35-
--metadata title="agent-desktop" \
36-
-o _site/index.html
29+
cp docs/index.html _site/index.html
3730
cp docs/style.css _site/style.css
38-
cp docs/highlight.js _site/highlight.js
3931
4032
- uses: actions/configure-pages@v5
4133

docs/index.html

Lines changed: 210 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,210 @@
1+
<!DOCTYPE html>
2+
<html lang="en">
3+
<head>
4+
<meta charset="utf-8">
5+
<meta name="viewport" content="width=device-width, initial-scale=1">
6+
<title>agent-desktop — Desktop automation CLI for AI agents</title>
7+
<meta name="description" content="Desktop automation CLI for AI agents. Observe and interact with any UI via accessibility APIs.">
8+
<meta property="og:title" content="agent-desktop">
9+
<meta property="og:description" content="Desktop automation CLI for AI agents. Observe and interact with any UI via accessibility APIs.">
10+
<meta property="og:type" content="website">
11+
<link rel="preconnect" href="https://fonts.googleapis.com">
12+
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
13+
<link href="https://fonts.googleapis.com/css2?family=Inter+Tight:wght@400;500;600&family=JetBrains+Mono:wght@400;500&display=swap" rel="stylesheet">
14+
<link rel="stylesheet" href="style.css">
15+
</head>
16+
<body>
17+
18+
<div class="shell">
19+
20+
<aside class="toc">
21+
<div class="toc-group">
22+
<h5>Overview</h5>
23+
<a href="#top" class="active">Introduction</a>
24+
<a href="#install">Install</a>
25+
26+
<a href="#usage">Usage</a>
27+
</div>
28+
<div class="toc-group">
29+
<h5>Reference</h5>
30+
<a href="#example">Example</a>
31+
<a href="#how">How it works</a>
32+
<a href="#commands">Commands</a>
33+
<a href="#selectors">Selectors</a>
34+
</div>
35+
<div class="toc-group">
36+
<h5>Platform</h5>
37+
<a href="#platforms">Platforms</a>
38+
<a href="#license">License</a>
39+
</div>
40+
</aside>
41+
42+
<main id="top">
43+
44+
<div class="lede">
45+
<h1>Observe and interact with any UI, <em>from the command line.</em></h1>
46+
<p class="sub">
47+
A desktop automation CLI designed for AI agents. Reads the OS accessibility tree,
48+
targets elements with CSS-like selectors, and exposes click, type, scroll, and screenshot
49+
as shell commands. macOS, Linux, and Windows. Native Rust, single binary.
50+
</p>
51+
<div class="meta-row">
52+
<span class="chip">macOS · Linux · Windows</span>
53+
54+
<span class="chip">MIT licensed</span>
55+
</div>
56+
</div>
57+
58+
<section id="install">
59+
<h2>Install</h2>
60+
<pre class="code"><code><span class="c"># via cargo</span>
61+
cargo install agent-desktop
62+
63+
<span class="c"># verify</span>
64+
agent-desktop --version</code></pre>
65+
</section>
66+
67+
<section id="usage">
68+
<h2>Usage</h2>
69+
<p>Add the following to your agent's system prompt or instructions:</p>
70+
<div class="prompt-block">
71+
<div class="prompt-label">Prompt</div>
72+
<div class="prompt-body">You have access to <code>agent-desktop</code>, a CLI for desktop automation via accessibility APIs. Run <code>agent-desktop --help</code> to see available commands. Use it to observe, click, type, scroll, and screenshot any application on the user's desktop. Start by observing the screen to understand what's visible, then interact with elements using their IDs or CSS-like selectors.</div>
73+
</div>
74+
</section>
75+
76+
<section id="example">
77+
<h2>Example</h2>
78+
<pre class="code"><code><span class="c"># List all running apps</span>
79+
agent-desktop observe
80+
81+
<span class="c"># Get the accessibility tree for a specific app</span>
82+
agent-desktop observe --app Safari
83+
84+
<span class="c"># Filter with CSS-like selectors</span>
85+
agent-desktop observe --app Safari --query 'button[name="OK"]'
86+
87+
<span class="c"># Interact</span>
88+
agent-desktop click --app Safari --query 'button[name="OK"]'
89+
agent-desktop type --text "hello world"
90+
agent-desktop screenshot --output /tmp/screen.png</code></pre>
91+
</section>
92+
93+
<section id="how">
94+
<h2>How it works</h2>
95+
<ol>
96+
<li><code>observe</code> — query the accessibility tree, get structured element data back.</li>
97+
<li>Use element IDs or selectors to click, type, scroll, or send a keystroke.</li>
98+
<li>Re-observe after each action to get updated state.</li>
99+
</ol>
100+
</section>
101+
102+
<section id="commands">
103+
<h2>Commands</h2>
104+
<div class="cmd-table">
105+
<div class="cmd-row"><code>observe</code><div class="desc">List apps, read the accessibility tree, filter with selectors, show role distribution.</div></div>
106+
<div class="cmd-row"><code>click</code><div class="desc">By element, selector, or absolute coordinates.</div></div>
107+
<div class="cmd-row"><code>type</code><div class="desc">At cursor, or into a specific element via <code>--query</code>.</div></div>
108+
<div class="cmd-row"><code>scroll</code><div class="desc">Direction, amount, target element.</div></div>
109+
<div class="cmd-row"><code>key</code><div class="desc">Single key or combo, e.g. <code>cmd+n</code>.</div></div>
110+
<div class="cmd-row"><code>focus</code><div class="desc">Bring an app to the foreground.</div></div>
111+
<div class="cmd-row"><code>read</code><div class="desc">Element text or value; also the system clipboard.</div></div>
112+
<div class="cmd-row"><code>wait</code><div class="desc">Block until a selector resolves.</div></div>
113+
<div class="cmd-row"><code>interact</code><div class="desc">Invoke a native accessibility action directly.</div></div>
114+
<div class="cmd-row"><code>screenshot</code><div class="desc">Write a PNG to disk.</div></div>
115+
</div>
116+
</section>
117+
118+
<section id="selectors">
119+
<h2>Selectors</h2>
120+
<p>The <code>--query</code> flag accepts CSS-like selectors for targeting elements in the accessibility tree.</p>
121+
122+
<h3>By role</h3>
123+
<pre class="code"><code>agent-desktop observe --app Safari --query 'button'
124+
agent-desktop observe --app Safari --query 'text_field'
125+
agent-desktop observe --app Safari --query 'menu_item'</code></pre>
126+
127+
<h3>By attribute</h3>
128+
<p>Supported attributes: <code>name</code>, <code>value</code>, <code>description</code>, <code>role</code>.</p>
129+
<div class="cmd-table">
130+
<div class="cmd-row"><code>[name="Submit"]</code><div class="desc">Exact match on name.</div></div>
131+
<div class="cmd-row"><code>[name*="addr"]</code><div class="desc">Substring match (case-insensitive).</div></div>
132+
<div class="cmd-row"><code>[name^="addr"]</code><div class="desc">Starts-with match (case-insensitive).</div></div>
133+
<div class="cmd-row"><code>[value="foo"]</code><div class="desc">Match by value attribute.</div></div>
134+
</div>
135+
136+
<h3>Combinators</h3>
137+
<div class="cmd-table" style="margin-bottom: 16px;">
138+
<div class="cmd-row"><code>toolbar > text_field</code><div class="desc">Direct child — text_field must be an immediate child of toolbar.</div></div>
139+
<div class="cmd-row"><code>toolbar text_field</code><div class="desc">Descendant — text_field anywhere inside toolbar.</div></div>
140+
</div>
141+
142+
<h3>Nth matching</h3>
143+
<pre class="code"><code><span class="c"># Click the 2nd button (1-based index)</span>
144+
agent-desktop click --app Safari --query 'button:nth(2)'</code></pre>
145+
146+
<h3>Combined</h3>
147+
<pre class="code"><code><span class="c"># Text field whose name contains "Address", directly inside a toolbar</span>
148+
agent-desktop observe --app Safari --query 'toolbar > text_field[name*="Address"]'
149+
150+
<span class="c"># 3rd menu item whose name starts with "File"</span>
151+
agent-desktop click --app Finder --query 'menu_item[name^="File"]:nth(3)'</code></pre>
152+
</section>
153+
154+
<section id="platforms">
155+
<h2>Platforms</h2>
156+
<div class="platforms">
157+
<div class="plat">
158+
<div class="pname">macOS</div>
159+
<div class="parch">arm64 · x64</div>
160+
<div class="papi">Accessibility API: <span>AXUIElement</span></div>
161+
</div>
162+
<div class="plat">
163+
<div class="pname">Linux</div>
164+
<div class="parch">x64</div>
165+
<div class="papi">Accessibility API: <span>AT-SPI2 via D-Bus</span></div>
166+
</div>
167+
<div class="plat">
168+
<div class="pname">Windows</div>
169+
<div class="parch">x64</div>
170+
<div class="papi">Accessibility API: <span>UI Automation</span></div>
171+
</div>
172+
</div>
173+
</section>
174+
175+
<section id="license">
176+
<h2>License</h2>
177+
<p>MIT. Source on <a href="https://github.com/crowecawcaw/agent-desktop">GitHub</a>.</p>
178+
</section>
179+
180+
</main>
181+
</div>
182+
183+
<footer>
184+
<span>agent-desktop</span>
185+
<span class="sep">·</span>
186+
<a href="https://github.com/crowecawcaw/agent-desktop">GitHub</a>
187+
<span class="spacer"></span>
188+
<span>MIT</span>
189+
</footer>
190+
191+
<script>
192+
// TOC active-section tracking
193+
const tocLinks = Array.from(document.querySelectorAll('.toc a'));
194+
const sections = tocLinks
195+
.map(a => document.querySelector(a.getAttribute('href')))
196+
.filter(Boolean);
197+
const byId = new Map(sections.map(s => [s.id, tocLinks.find(a => a.getAttribute('href') === '#' + s.id)]));
198+
const obs = new IntersectionObserver((entries) => {
199+
entries.forEach(e => {
200+
if (e.isIntersecting) {
201+
tocLinks.forEach(a => a.classList.remove('active'));
202+
const link = byId.get(e.target.id);
203+
if (link) link.classList.add('active');
204+
}
205+
});
206+
}, { rootMargin: '-40% 0px -55% 0px' });
207+
sections.forEach(s => obs.observe(s));
208+
</script>
209+
</body>
210+
</html>

0 commit comments

Comments
 (0)