Skip to content

Fix: request /crawl with stream: true issue #1066 #1074

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions deploy/docker/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -470,9 +470,20 @@ async def crawl(
):
"""
Crawl a list of URLs and return the results as JSON.
For streaming responses, use /crawl/stream endpoint.
"""
if not crawl_request.urls:
raise HTTPException(400, "At least one URL required")

# If the crawler config requests streaming, redirect the client to /crawl/stream
crawler_config = CrawlerRunConfig.load(crawl_request.crawler_config)
if crawler_config.stream:
return RedirectResponse(
url="/crawl/stream",
status_code=307,
headers={"Content-Type": "application/json"}
)

res = await handle_crawl_request(
urls=crawl_request.urls,
browser_config=crawl_request.browser_config,
Expand Down
19 changes: 19 additions & 0 deletions deploy/docker/static/playground/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -503,6 +503,25 @@ <h2 class="font-medium text-accent">🔥 Stress Test</h2>
} else {
// For /crawl endpoints, show the advanced config
advConfig.classList.remove('hidden');

// Update the stream configuration in CodeMirror
let currentConfig = cm.getValue();
if (endpoint === 'crawl_stream') {
// Make sure that stream=True
if (!currentConfig.includes('stream=')) {
currentConfig = currentConfig.replace('CrawlerRunConfig(', 'CrawlerRunConfig(\n stream=True,');
} else {
currentConfig = currentConfig.replace(/stream=False/, 'stream=True');
}
} else {
// Make sure that stream=False
if (!currentConfig.includes('stream=')) {
currentConfig = currentConfig.replace('CrawlerRunConfig(', 'CrawlerRunConfig(\n stream=False,');
} else {
currentConfig = currentConfig.replace(/stream=True/, 'stream=False');
}
}
cm.setValue(currentConfig);
}
});

Expand Down
36 changes: 36 additions & 0 deletions tests/docker/test_server_requests.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,42 @@ async def test_simple_crawl_single_url(self, async_client: httpx.AsyncClient):
# We don't specify a markdown generator in this test, so don't make assumptions about markdown field
# It might be null, missing, or populated depending on the server's default behavior

async def test_crawl_with_stream_redirects(self, async_client: httpx.AsyncClient):
    """POST /crawl with ``stream=True`` and check that a streamed result comes back.

    The request is sent to ``/crawl`` with ``follow_redirects=True``; the
    final response is expected to be an active NDJSON stream (status 200,
    ``application/x-ndjson`` content type, ``x-stream-status: active``)
    carrying exactly one successful crawl result for ``SIMPLE_HTML_URL``.
    """
    browser_section = {
        "type": "BrowserConfig",
        "params": {"headless": True},
    }
    crawler_section = {
        "type": "CrawlerRunConfig",
        "params": {
            # stream=True is the setting under test: it should trigger the redirect.
            "stream": True,
            "screenshot": False,
            "cache_mode": CacheMode.BYPASS.value,
        },
    }
    request_body = {
        "urls": [SIMPLE_HTML_URL],
        "browser_config": browser_section,
        "crawler_config": crawler_section,
    }

    # Hit the non-streaming endpoint and let the client follow any redirect.
    streamed = await async_client.post(
        "/crawl",
        json=request_body,
        follow_redirects=True,
    )

    # The final response must look like a live streaming response.
    assert streamed.status_code == 200
    assert streamed.headers["content-type"] == "application/x-ndjson"
    assert streamed.headers.get("x-stream-status") == "active"

    records = await process_streaming_response(streamed)

    # One URL in, one record out.
    assert len(records) == 1
    record = records[0]
    await assert_crawl_result_structure(record)
    assert record["success"] is True
    assert record["url"] == SIMPLE_HTML_URL
    assert "<h1>Herman Melville - Moby-Dick</h1>" in record["html"]

async def test_simple_crawl_single_url_streaming(self, async_client: httpx.AsyncClient):
"""Test /crawl/stream with a single URL and simple config values."""
payload = {
Expand Down