
Commit b703709

chore: add tests for processing request payload (#688)
### Description

Corrects comments related to PR #683 and adds new tests covering the different forms of `payload` accepted by `http_crawler`.

### Testing

The tests check that the `payload` sent by the client matches the data received by the server. They also verify that the different types of `payload` are handled correctly on the client side and recognized correctly by the server when the appropriate headers are set.

### Checklist

- [x] CI passed
1 parent 5827b93 commit b703709
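Before the diff itself, a minimal sketch of the pattern these tests exercise: build a `Request` with a `payload` and a content-type header, run it through `HttpCrawler`, and inspect the echoed response. The import paths are assumed from crawlee's public API (the test module below imports `HttpCrawlingContext` from a private path); treat this as an illustration, not part of the commit.

```python
import asyncio
import json
from urllib.parse import urlencode

from crawlee import Request
from crawlee.http_crawler import HttpCrawler, HttpCrawlingContext


async def main() -> None:
    crawler = HttpCrawler()

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext) -> None:
        # httpbin.org/post echoes the request back, so the body shows how
        # the server interpreted the payload (raw data, form, or JSON).
        print(json.loads(context.http_response.read()))

    request = Request.from_url(
        url='https://httpbin.org/post',
        method='POST',
        # The content-type header tells the server to parse the body as
        # form fields instead of keeping it as raw data.
        headers={'content-type': 'application/x-www-form-urlencoded'},
        payload=urlencode({'size': 'large'}).encode(),
    )
    await crawler.run([request])


if __name__ == '__main__':
    asyncio.run(main())
```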

File tree

3 files changed (+94 -30 lines):

- src/crawlee/_request.py
- tests/unit/_memory_storage_client/test_request_queue_client.py
- tests/unit/http_crawler/test_http_crawler.py


src/crawlee/_request.py (+6 -2)

```diff
@@ -138,9 +138,13 @@ class BaseRequestData(BaseModel):
     payload: Annotated[
         HttpPayload | None,
         BeforeValidator(lambda v: v.encode() if isinstance(v, str) else v),
-        PlainSerializer(lambda v: v.decode() if isinstance(v, bytes) else None),
+        PlainSerializer(lambda v: v.decode() if isinstance(v, bytes) else v),
     ] = None
-    """HTTP request payload."""
+    """HTTP request payload.
+
+    TODO: Re-check the need for `Validator` and `Serializer` once the issue is resolved.
+    https://github.com/apify/crawlee-python/issues/94
+    """
 
     user_data: Annotated[
         dict[str, JsonSerializable],  # Internally, the model contains `UserData`, this is just for convenience
```
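The serializer fix replaces the `else None` branch with a pass-through, so non-`bytes` values are no longer silently collapsed to `None` on serialization. A minimal standalone sketch of the same validator/serializer pair, with `HttpPayload` assumed to be a `bytes` alias for illustration:

```python
from typing import Annotated

from pydantic import BaseModel, BeforeValidator, PlainSerializer

HttpPayload = bytes  # stand-in; the real alias lives in crawlee's type definitions


class Model(BaseModel):
    payload: Annotated[
        HttpPayload | None,
        # Accept `str` input by encoding it; store `bytes` internally.
        BeforeValidator(lambda v: v.encode() if isinstance(v, str) else v),
        # Decode `bytes` back to `str` on dump; any non-`bytes` value now
        # passes through instead of being collapsed to `None`.
        PlainSerializer(lambda v: v.decode() if isinstance(v, bytes) else v),
    ] = None


assert Model(payload='a=1').payload == b'a=1'            # validated str -> bytes
assert Model(payload='a=1').model_dump()['payload'] == 'a=1'  # dumped bytes -> str
assert Model().model_dump()['payload'] is None            # None passes through
```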

tests/unit/_memory_storage_client/test_request_queue_client.py (-1)

```diff
@@ -105,7 +105,6 @@ async def test_request_state_serialization(request_queue_client: RequestQueueCli
     got_request = await request_queue_client.get_request(request.id)
 
     assert request == got_request
-    assert request.payload == got_request.payload
 
 
 async def test_add_record(request_queue_client: RequestQueueClient) -> None:
```
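The dropped assertion was redundant: Pydantic model equality already compares every field, `payload` included. A small illustration with a hypothetical stand-in model:

```python
from pydantic import BaseModel


class M(BaseModel):  # hypothetical stand-in for Request
    url: str
    payload: bytes | None = None


# `==` compares all fields, so a separate payload check adds no coverage.
assert M(url='a', payload=b'x') == M(url='a', payload=b'x')
assert M(url='a', payload=b'x') != M(url='a', payload=b'y')
```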

tests/unit/http_crawler/test_http_crawler.py (+88 -27)

```diff
@@ -21,6 +21,18 @@
 from crawlee.http_crawler._http_crawling_context import HttpCrawlingContext
 
 
+# Payload, e.g. data for a form submission.
+PAYLOAD = {
+    'custname': 'John Doe',
+    'custtel': '1234567890',
+    'custemail': '[email protected]',
+    'size': 'large',
+    'topping': '["bacon", "cheese", "mushroom"]',
+    'delivery': '13:00',
+    'comments': 'Please ring the doorbell upon arrival.',
+}
+
+
 @pytest.fixture
 async def mock_request_handler() -> Callable[[HttpCrawlingContext], Awaitable[None]] | AsyncMock:
     return AsyncMock()
@@ -214,21 +226,9 @@ async def test_http_status_statistics(crawler: HttpCrawler, server: respx.MockRo
     [CurlImpersonateHttpClient, HttpxHttpClient],
     ids=['curl', 'httpx'],
 )
-async def test_sending_payload(http_client_class: type[BaseHttpClient]) -> None:
+async def test_sending_payload_as_raw_data(http_client_class: type[BaseHttpClient]) -> None:
     http_client = http_client_class()
     crawler = HttpCrawler(http_client=http_client)
-
-    # Payload, e.g. data from a form submission.
-    payload = {
-        'custname': 'John Doe',
-        'custtel': '1234567890',
-        'custemail': '[email protected]',
-        'size': 'large',
-        'topping': '["bacon", "cheese", "mushroom"]',
-        'delivery': '13:00',
-        'comments': 'Please ring the doorbell upon arrival.',
-    }
-
     responses = []
 
     @crawler.router.default_handler
@@ -237,35 +237,100 @@ async def request_handler(context: HttpCrawlingContext) -> None:
         # The httpbin.org/post endpoint returns the provided payload in the response.
         responses.append(response)
 
+    encoded_payload = urlencode(PAYLOAD).encode()
     request = Request.from_url(
         url='https://httpbin.org/post',
         method='POST',
-        payload=urlencode(payload).encode(),
+        payload=encoded_payload,
     )
 
     await crawler.run([request])
 
-    # The request handler should be called once.
-    assert len(responses) == 1, 'The request handler should be called once.'
+    assert len(responses) == 1, 'Request handler should be called exactly once.'
+    assert responses[0]['data'].encode() == encoded_payload, 'Response payload data does not match the sent payload.'
 
     # The reconstructed payload data should match the original payload. We have to flatten the values, because
     # parse_qs returns a list of values for each key.
-    response_data = {
-        k: v[0] if len(v) == 1 else v for k, v in parse_qs(responses[0]['data'].strip("b'").strip("'")).items()
-    }
+    response_data = {k: v[0] if len(v) == 1 else v for k, v in parse_qs(responses[0]['data']).items()}
+    assert response_data == PAYLOAD, 'The reconstructed payload data does not match the sent payload.'
 
-    assert response_data == payload, 'The reconstructed payload data should match the original payload.'
+    assert responses[0]['json'] is None, 'Response JSON data should be empty when only raw data is sent.'
+    assert responses[0]['form'] == {}, 'Response form data should be empty when only raw data is sent.'
 
 
 @pytest.mark.parametrize(
     'http_client_class',
     [CurlImpersonateHttpClient, HttpxHttpClient],
     ids=['curl', 'httpx'],
 )
-async def test_sending_url_query_params(http_client_class: type[BaseHttpClient]) -> None:
+async def test_sending_payload_as_form_data(http_client_class: type[BaseHttpClient]) -> None:
     http_client = http_client_class()
     crawler = HttpCrawler(http_client=http_client)
+    responses = []
 
+    @crawler.router.default_handler
+    async def request_handler(context: HttpCrawlingContext) -> None:
+        response = json.loads(context.http_response.read())
+        # The httpbin.org/post endpoint returns the provided payload in the response.
+        responses.append(response)
+
+    request = Request.from_url(
+        url='https://httpbin.org/post',
+        method='POST',
+        headers={'content-type': 'application/x-www-form-urlencoded'},
+        payload=urlencode(PAYLOAD).encode(),
+    )
+
+    await crawler.run([request])
+
+    assert len(responses) == 1, 'Request handler should be called exactly once.'
+    assert responses[0]['form'] == PAYLOAD, 'Form data in response does not match the sent payload.'
+
+    assert responses[0]['json'] is None, 'Response JSON data should be empty when only form data is sent.'
+    assert responses[0]['data'] == '', 'Response raw data should be empty when only form data is sent.'
+
+
+@pytest.mark.parametrize(
+    'http_client_class',
+    [CurlImpersonateHttpClient, HttpxHttpClient],
+    ids=['curl', 'httpx'],
+)
+async def test_sending_payload_as_json(http_client_class: type[BaseHttpClient]) -> None:
+    http_client = http_client_class()
+    crawler = HttpCrawler(http_client=http_client)
+    responses = []
+
+    @crawler.router.default_handler
+    async def request_handler(context: HttpCrawlingContext) -> None:
+        response = json.loads(context.http_response.read())
+        # The httpbin.org/post endpoint returns the provided payload in the response.
+        responses.append(response)
+
+    json_payload = json.dumps(PAYLOAD).encode()
+    request = Request.from_url(
+        url='https://httpbin.org/post',
+        method='POST',
+        payload=json_payload,
+        headers={'content-type': 'application/json'},
+    )
+
+    await crawler.run([request])
+
+    assert len(responses) == 1, 'Request handler should be called exactly once.'
+    assert responses[0]['data'].encode() == json_payload, 'Response raw JSON data does not match the sent payload.'
+    assert responses[0]['json'] == PAYLOAD, 'Response JSON data does not match the sent payload.'
+
+    assert responses[0]['form'] == {}, 'Response form data should be empty when only JSON data is sent.'
+
+
+@pytest.mark.parametrize(
+    'http_client_class',
+    [CurlImpersonateHttpClient, HttpxHttpClient],
+    ids=['curl', 'httpx'],
+)
+async def test_sending_url_query_params(http_client_class: type[BaseHttpClient]) -> None:
+    http_client = http_client_class()
+    crawler = HttpCrawler(http_client=http_client)
     responses = []
 
     @crawler.router.default_handler
@@ -280,11 +345,7 @@ async def request_handler(context: HttpCrawlingContext) -> None:
 
     await crawler.run([request])
 
-    # The request handler should be called once.
-    assert len(responses) == 1, 'The request handler should be called once.'
+    assert len(responses) == 1, 'Request handler should be called exactly once.'
 
-    # Validate the response query parameters.
     response_args = responses[0]['args']
-    assert (
-        response_args == query_params
-    ), 'The reconstructed query parameters should match the original query parameters.'
+    assert response_args == query_params, 'Reconstructed query params must match the original query params.'
```
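As a reading aid for the assertions above: the httpbin.org/post endpoint echoes the request back, and which response field is populated depends on the request's content-type header. A rough sketch of the relevant fields, based on httpbin's documented behavior rather than anything in this repository:

```python
# Sketch of the echo returned by https://httpbin.org/post (abridged).
httpbin_echo = {
    'args': {},    # URL query parameters
    'data': '',    # raw request body, as a string
    'form': {},    # body parsed as form fields (x-www-form-urlencoded)
    'json': None,  # body parsed as JSON (application/json), else None
}
# raw bytes, no content type -> only 'data' is set
# x-www-form-urlencoded      -> only 'form' is set
# application/json           -> 'data' holds the raw text, 'json' the parsed object
```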
