Skip to content

Commit dbc20ea

Browse files
j-mendez and claude committed
gottem-cli: --formats + --return-links flags (0.1.7 -> 0.1.8)
Surfaces the v0.1.7 server features (Format::Markdown/Html/Text/Screenshot via content_by_format, return_links via response.links) on the CLI's `--remote` path:

    gottem fetch --remote \
      --formats markdown,html,screenshot \
      --return-links \
      https://example.com

- `--formats` accepts a comma-separated list; values are normalized to lowercase + trimmed before send. Unknown formats are dropped server-side (forward-compat).
- `--return-links` toggles the sibling `links` response field (spider_service convention).
- Output rendering: multi-format responses print one labelled section per format; single-format / legacy keeps the existing `content` output unchanged. Links print at the tail with a stderr header showing the count.
- Local-mode fetch is unchanged; transform pipeline lives in gottem-cloud, so `--formats` is `--remote`-only for now.

Verified live against api.gottem.dev: probe with the new flags lands a structured `{"error":"authentication required","code":"AUTH_REQUIRED"}` 401 — request shape accepted by the v0.1.7 server.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 9c2439f commit dbc20ea

3 files changed

Lines changed: 62 additions & 3 deletions

File tree

Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/gottem-cli/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "gottem-cli"
3-
version = "0.1.7"
3+
version = "0.1.8"
44
description = "gottem CLI: universal scraper that reliably gets the data."
55
readme = "README.md"
66
edition.workspace = true

crates/gottem-cli/src/main.rs

Lines changed: 60 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,25 @@ struct FetchArgs {
126126
/// gottem API key (`gtm_…`) for --remote. Falls back to $GOTTEM_API_KEY.
127127
#[arg(long, env = "GOTTEM_API_KEY", hide_env_values = true)]
128128
api_key: Option<String>,
129+
130+
/// Output formats to request from the hosted API (comma-separated).
131+
/// Each value maps onto a `gottem_core::Format`. Server runs
132+
/// `spider_transformations` after the orchestrator returns and packs
133+
/// one payload per format into `content_by_format` on the response.
134+
///
135+
/// Only honored with `--remote` today — local mode doesn't yet run the
136+
/// transform pipeline.
137+
///
138+
/// Valid values: `markdown`, `html`, `text`, `screenshot`.
139+
#[arg(long = "formats", value_delimiter = ',')]
140+
content_formats: Vec<String>,
141+
142+
/// Populate the `links` response field with absolute URLs scraped from
143+
/// the page's `<a href>` anchors (sorted + deduped). Mirrors
144+
/// spider_service's `return_page_links` — links sit beside the content,
145+
/// not inside `content_by_format`. `--remote` only.
146+
#[arg(long = "return-links")]
147+
return_links: bool,
129148
}
130149

131150
#[derive(ValueEnum, Clone, Copy, Debug)]
@@ -383,6 +402,22 @@ async fn run_fetch_remote(args: FetchArgs) -> Result<()> {
383402
if !args.routes.is_empty() {
384403
body["routes"] = serde_json::json!(args.routes);
385404
}
405+
if !args.content_formats.is_empty() {
406+
// Lowercase + trim per element — server parses against `Format` with
407+
// a lowercase-rename serde, and unknown values are silently dropped
408+
// (forward-compat). Sending them lowercased keeps the wire shape
409+
// canonical.
410+
let normalized: Vec<String> = args
411+
.content_formats
412+
.iter()
413+
.map(|f| f.trim().to_lowercase())
414+
.filter(|f| !f.is_empty())
415+
.collect();
416+
body["formats"] = serde_json::json!(normalized);
417+
}
418+
if args.return_links {
419+
body["return_links"] = serde_json::json!(true);
420+
}
386421

387422
let resp = http_client()
388423
.post(format!("{base}/scrape"))
@@ -412,7 +447,31 @@ async fn run_fetch_remote(args: FetchArgs) -> Result<()> {
412447
parsed["credits_charged"],
413448
);
414449
}
415-
println!("{}", parsed["content"].as_str().unwrap_or(""));
450+
// Multi-format response: print each format under a labelled
451+
// header so the caller can pipe one stream into a file with
452+
// `tee`/`sed`. Single-format / legacy responses keep their
453+
// current behaviour — just `content` to stdout.
454+
if let Some(by_format) = parsed.get("content_by_format").and_then(|v| v.as_object()) {
455+
for (fmt, value) in by_format {
456+
println!("--- {fmt} ---");
457+
println!("{}", value.as_str().unwrap_or(""));
458+
}
459+
} else {
460+
println!("{}", parsed["content"].as_str().unwrap_or(""));
461+
}
462+
// Links sit alongside the content payloads (spider_service
463+
// convention); emit them after when present so piping `--format
464+
// content` to a file gets the URLs at the tail.
465+
if let Some(links) = parsed.get("links").and_then(|v| v.as_array()) {
466+
if !links.is_empty() {
467+
eprintln!("--- links ({}) ---", links.len());
468+
for link in links {
469+
if let Some(s) = link.as_str() {
470+
println!("{s}");
471+
}
472+
}
473+
}
474+
}
416475
}
417476
}
418477
Ok(())

0 commit comments

Comments
 (0)