The Crawl endpoint discovers and extracts content from multiple pages starting
from a given URL.
API Reference: https://spider.cloud/docs/api#crawl
# Always use a limit to control credit usage
response = SpiderCloud.crawl( 'https://example.com', limit: 5 )
response.result.each do | page |
puts "#{ page.url }: #{ page.content&.length } chars"
end
options = SpiderCloud::CrawlOptions.build do
limit 5 # max pages to crawl
depth 2 # max link depth
return_format :markdown
readability true
end
response = SpiderCloud.crawl( 'https://example.com', options )
| Option | Type | Default | Description |
|---|---|---|---|
| `limit` | Integer | 0 | Max pages to crawl (0 = unlimited) |
| `depth` | Integer | 25 | Max crawl depth |
| `return_format` | Symbol | `:raw` | Output format |
| `request` | Symbol | `:smart` | Request type |
| Option | Type | Description |
|---|---|---|
| `subdomains` | Boolean | Include subdomains |
| `tld` | Boolean | Include TLD variations |
| `external_domains` | Array | External domains to include (`["*"]` for all) |
| `redirect_policy` | Symbol | One of `:loose`, `:strict`, `:none` |
| Option | Type | Description |
|---|---|---|
| `blacklist` | Array | Paths to exclude (regex supported) |
| `whitelist` | Array | Paths to include only |
| `budget` | Hash | Path-based page limits |
| `link_rewrite` | Hash | URL rewrite rules |
Control how many pages to crawl per path:
options = SpiderCloud::CrawlOptions.build do
limit 100
budget( {
'*' => 5, # default: 5 pages per path
'/docs/' => 50, # up to 50 pages under /docs/
'/blog/' => 20 # up to 20 pages under /blog/
} )
end
| Option | Type | Description |
|---|---|---|
| `sitemap` | Boolean | Use sitemap for discovery |
| `sitemap_only` | Boolean | Only crawl sitemap URLs |
| `sitemap_path` | String | Custom sitemap path |
Content Extraction
| Option | Type | Description |
|---|---|---|
| `readability` | Boolean | Safari Reader Mode |
| `root_selector` | String | CSS selector for content |
| `exclude_selector` | String | CSS selector to ignore |
| `css_extraction_map` | Hash | Structured data extraction |
| `filter_main_only` | Boolean | Main content only |
| `full_resources` | Boolean | Download images, videos |
| Option | Type | Description |
|---|---|---|
| `return_json_data` | Boolean | Return SSR JSON data |
| `return_headers` | Boolean | Include HTTP headers |
| `return_cookies` | Boolean | Include cookies |
| `return_page_links` | Boolean | Include discovered links |
| `return_embeddings` | Boolean | Include embeddings |
| `metadata` | Boolean | Collect page metadata |
| Option | Type | Description |
|---|---|---|
| `request_timeout` | Integer | Timeout per page (5-255 seconds) |
| `cache` | Boolean | Enable caching |
| `concurrency_limit` | Integer | Concurrent requests |
| `delay` | Integer | Delay between requests (ms) |
| Option | Type | Description |
|---|---|---|
| `max_credits_per_page` | Integer | Max credits per page |
| `max_credits_allowed` | Integer | Total credit limit |
| `crawl_timeout` | Hash | Max crawl duration `{seconds:, nanoseconds:}` |
options = SpiderCloud::CrawlOptions.build do
limit 100
webhooks do
destination 'https://your-server.com/webhook'
on_credits_depleted true
on_find true
end
end
response = SpiderCloud.crawl( 'https://example.com', limit: 5 )
response.result.success? # => true
response.result.count # => 5
response.result.urls # => ["https://...", ...]
response.result.contents # => ["...", ...]
response.result.total_cost # => 0.0002
# Iterate over pages
response.result.each do | page |
page.url # => "https://..."
page.content # => "..."
page.status # => 200
page.costs.total_cost # => 0.00004
end
# Filter by success
response.result.succeeded # => [successful pages]
response.result.failed # => [failed pages]
options = SpiderCloud::CrawlOptions.build do
limit 50
whitelist [ '/docs/' ]
return_format :markdown
readability true
end
response = SpiderCloud.crawl( 'https://example.com', options )
options = SpiderCloud::CrawlOptions.build do
limit 20
depth 2
end
response = SpiderCloud.crawl( 'https://example.com', options )
options = SpiderCloud::CrawlOptions.build do
limit 50
blacklist [ '/admin/', '/private/', '/api/' ]
end
options = SpiderCloud::CrawlOptions.build do
limit 100
sitemap true
sitemap_only true
end
options = SpiderCloud::CrawlOptions.build do
limit 10
automation_scripts( {
'/login' => [
{ 'Fill' => { 'selector' => '#email', 'value' => 'user@example.com' } },
{ 'Click' => 'button[type=submit]' },
{ 'WaitForNavigation' => true }
]
} )
end