21
21
from crawlee .http_crawler ._http_crawling_context import HttpCrawlingContext
22
22
23
23
24
# Payload shared by the payload-sending tests, e.g. data for a form submission.
# Every value is a plain string so the mapping survives both URL-encoding and
# JSON serialization unchanged.
# NOTE(review): this view shows an empty added line between 'custtel' and 'size';
# a field may have been dropped there - confirm against the repository copy.
PAYLOAD = {
    'custname': 'John Doe',
    'custtel': '1234567890',
    'size': 'large',
    'topping': '["bacon", "cheese", "mushroom"]',
    'delivery': '13:00',
    'comments': 'Please ring the doorbell upon arrival.',
}
34
+
35
+
24
36
@pytest.fixture
async def mock_request_handler() -> Callable[[HttpCrawlingContext], Awaitable[None]] | AsyncMock:
    """Provide a fresh `AsyncMock` for tests that need a stand-in request handler.

    Returns:
        A new `AsyncMock` instance; a separate one is created each time the fixture is set up.
    """
    return AsyncMock()
@@ -214,21 +226,9 @@ async def test_http_status_statistics(crawler: HttpCrawler, server: respx.MockRo
214
226
[CurlImpersonateHttpClient , HttpxHttpClient ],
215
227
ids = ['curl' , 'httpx' ],
216
228
)
217
- async def test_sending_payload (http_client_class : type [BaseHttpClient ]) -> None :
229
+ async def test_sending_payload_as_raw_data (http_client_class : type [BaseHttpClient ]) -> None :
218
230
http_client = http_client_class ()
219
231
crawler = HttpCrawler (http_client = http_client )
220
-
221
- # Payload, e.g. data from a form submission.
222
- payload = {
223
- 'custname' : 'John Doe' ,
224
- 'custtel' : '1234567890' ,
225
-
226
- 'size' : 'large' ,
227
- 'topping' : '["bacon", "cheese", "mushroom"]' ,
228
- 'delivery' : '13:00' ,
229
- 'comments' : 'Please ring the doorbell upon arrival.' ,
230
- }
231
-
232
232
responses = []
233
233
234
234
@crawler .router .default_handler
@@ -237,35 +237,100 @@ async def request_handler(context: HttpCrawlingContext) -> None:
237
237
# The httpbin.org/post endpoint returns the provided payload in the response.
238
238
responses .append (response )
239
239
240
+ encoded_payload = urlencode (PAYLOAD ).encode ()
240
241
request = Request .from_url (
241
242
url = 'https://httpbin.org/post' ,
242
243
method = 'POST' ,
243
- payload = urlencode ( payload ). encode () ,
244
+ payload = encoded_payload ,
244
245
)
245
246
246
247
await crawler .run ([request ])
247
248
248
- # The request handler should be called once.
249
- assert len ( responses ) == 1 , 'The request handler should be called once .'
249
+ assert len ( responses ) == 1 , 'Request handler should be called exactly once.'
250
+ assert responses [ 0 ][ 'data' ]. encode ( ) == encoded_payload , 'Response payload data does not match the sent payload .'
250
251
251
252
# The reconstructed payload data should match the original payload. We have to flatten the values, because
252
253
# parse_qs returns a list of values for each key.
253
- response_data = {
254
- k : v [0 ] if len (v ) == 1 else v for k , v in parse_qs (responses [0 ]['data' ].strip ("b'" ).strip ("'" )).items ()
255
- }
254
+ response_data = {k : v [0 ] if len (v ) == 1 else v for k , v in parse_qs (responses [0 ]['data' ]).items ()}
255
+ assert response_data == PAYLOAD , 'The reconstructed payload data does not match the sent payload.'
256
256
257
- assert response_data == payload , 'The reconstructed payload data should match the original payload.'
257
+ assert responses [0 ]['json' ] is None , 'Response JSON data should be empty when only raw data is sent.'
258
+ assert responses [0 ]['form' ] == {}, 'Response form data should be empty when only raw data is sent.'
258
259
259
260
260
261
@pytest.mark.parametrize(
    'http_client_class',
    [CurlImpersonateHttpClient, HttpxHttpClient],
    ids=['curl', 'httpx'],
)
async def test_sending_payload_as_form_data(http_client_class: type[BaseHttpClient]) -> None:
    """Check that a URL-encoded payload sent with a form content type is echoed back as form data.

    The request is POSTed to https://httpbin.org/post with the
    `application/x-www-form-urlencoded` content type. The test then asserts that the
    echoed `form` field equals `PAYLOAD`, while the `json` field is `None` and the
    `data` field is an empty string.

    Args:
        http_client_class: HTTP client implementation under test; it is instantiated with no arguments.
    """
    http_client = http_client_class()
    crawler = HttpCrawler(http_client=http_client)
    responses: list[dict] = []

    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        # The httpbin.org/post endpoint returns the provided payload in the response.
        response = json.loads(context.http_response.read())
        responses.append(response)

    request = Request.from_url(
        url='https://httpbin.org/post',
        method='POST',
        headers={'content-type': 'application/x-www-form-urlencoded'},
        payload=urlencode(PAYLOAD).encode(),
    )

    await crawler.run([request])

    assert len(responses) == 1, 'Request handler should be called exactly once.'
    assert responses[0]['form'] == PAYLOAD, 'Form data in response does not match the sent payload.'

    # With the form content type, the body must show up only under `form`.
    assert responses[0]['json'] is None, 'Response JSON data should be empty when only form data is sent.'
    assert responses[0]['data'] == '', 'Response raw data should be empty when only form data is sent.'
291
+
292
+
293
@pytest.mark.parametrize(
    'http_client_class',
    [CurlImpersonateHttpClient, HttpxHttpClient],
    ids=['curl', 'httpx'],
)
async def test_sending_payload_as_json(http_client_class: type[BaseHttpClient]) -> None:
    """Check that a JSON-serialized payload sent with a JSON content type is echoed back as JSON.

    `PAYLOAD` is serialized with `json.dumps`, encoded to bytes and POSTed to
    https://httpbin.org/post with the `application/json` content type. The test then
    asserts that the echoed `data` field, re-encoded, equals the sent bytes, that the
    `json` field equals `PAYLOAD`, and that the `form` field is an empty dict.

    Args:
        http_client_class: HTTP client implementation under test; it is instantiated with no arguments.
    """
    http_client = http_client_class()
    crawler = HttpCrawler(http_client=http_client)
    responses: list[dict] = []

    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        # The httpbin.org/post endpoint returns the provided payload in the response.
        response = json.loads(context.http_response.read())
        responses.append(response)

    # Keep the exact bytes that are sent so the echoed raw body can be compared against them.
    json_payload = json.dumps(PAYLOAD).encode()
    request = Request.from_url(
        url='https://httpbin.org/post',
        method='POST',
        payload=json_payload,
        headers={'content-type': 'application/json'},
    )

    await crawler.run([request])

    assert len(responses) == 1, 'Request handler should be called exactly once.'
    assert responses[0]['data'].encode() == json_payload, 'Response raw JSON data does not match the sent payload.'
    assert responses[0]['json'] == PAYLOAD, 'Response JSON data does not match the sent payload.'

    # A JSON body must not be interpreted as form fields.
    assert responses[0]['form'] == {}, 'Response form data should be empty when only JSON data is sent.'
324
+
325
+
326
+ @pytest .mark .parametrize (
327
+ 'http_client_class' ,
328
+ [CurlImpersonateHttpClient , HttpxHttpClient ],
329
+ ids = ['curl' , 'httpx' ],
330
+ )
331
+ async def test_sending_url_query_params (http_client_class : type [BaseHttpClient ]) -> None :
332
+ http_client = http_client_class ()
333
+ crawler = HttpCrawler (http_client = http_client )
269
334
responses = []
270
335
271
336
@crawler .router .default_handler
@@ -280,11 +345,7 @@ async def request_handler(context: HttpCrawlingContext) -> None:
280
345
281
346
await crawler .run ([request ])
282
347
283
- # The request handler should be called once.
284
- assert len (responses ) == 1 , 'The request handler should be called once.'
348
+ assert len (responses ) == 1 , 'Request handler should be called exactly once.'
285
349
286
- # Validate the response query parameters.
287
350
response_args = responses [0 ]['args' ]
288
- assert (
289
- response_args == query_params
290
- ), 'The reconstructed query parameters should match the original query parameters.'
351
+ assert response_args == query_params , 'Reconstructed query params must match the original query params.'
0 commit comments