Skip to content

Commit 462f747

Browse files
committed
Attempt to un-hang tests
1 parent 0716c20 commit 462f747

File tree

1 file changed

+11
-4
lines changed

1 file changed

+11
-4
lines changed

tests/unit/crawlers/_basic/test_basic_crawler.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import asyncio
55
import json
66
import logging
7+
import multiprocessing
78
import os
89
import re
910
import sys
@@ -1868,6 +1869,12 @@ def _process_run_crawlers(crawler_inputs: list[_CrawlerInput], storage_dir: str)
18681869
]
18691870

18701871

1872+
# Use the "spawn" start method to avoid inheriting the parent's tokio runtime thread state
1873+
# (created by pyo3-async-runtimes) via "fork", which causes the child to hang on exit on Linux.
1874+
# See pyo3-async-runtimes#40 / #64.
1875+
_spawn_context = multiprocessing.get_context('spawn')
1876+
1877+
18711878
async def test_crawler_state_persistence(tmp_path: Path) -> None:
18721879
"""Test that crawler statistics and state persist and are loaded correctly.
18731880
@@ -1877,7 +1884,7 @@ async def test_crawler_state_persistence(tmp_path: Path) -> None:
18771884
storage_client=FileSystemStorageClient(), configuration=Configuration(storage_dir=str(tmp_path))
18781885
)
18791886

1880-
with ProcessPoolExecutor() as executor:
1887+
with ProcessPoolExecutor(mp_context=_spawn_context) as executor:
18811888
# Crawl 2 requests in the first run and automatically persist the state.
18821889
first_run_state = executor.submit(
18831890
_process_run_crawlers,
@@ -1890,7 +1897,7 @@ async def test_crawler_state_persistence(tmp_path: Path) -> None:
18901897
assert state.get('urls') == ['https://a.placeholder.com', 'https://b.placeholder.com']
18911898

18921899
# Do not reuse the executor to simulate a fresh process to avoid modified class attributes.
1893-
with ProcessPoolExecutor() as executor:
1900+
with ProcessPoolExecutor(mp_context=_spawn_context) as executor:
18941901
# Crawl 1 additional request in the second run, but use the previously automatically persisted state.
18951902
second_run_state = executor.submit(
18961903
_process_run_crawlers,
@@ -1926,7 +1933,7 @@ async def test_crawler_state_persistence_2_crawlers_with_migration(tmp_path: Pat
19261933
storage_client=FileSystemStorageClient(), configuration=Configuration(storage_dir=str(tmp_path))
19271934
)
19281935

1929-
with ProcessPoolExecutor() as executor:
1936+
with ProcessPoolExecutor(mp_context=_spawn_context) as executor:
19301937
# Run 2 crawlers, each crawling 1 request, and automatically persist the state.
19311938
first_run_states = executor.submit(
19321939
_process_run_crawlers,
@@ -1944,7 +1951,7 @@ async def test_crawler_state_persistence_2_crawlers_with_migration(tmp_path: Pat
19441951
state_1 = await state_kvs.get_value(f'{BasicCrawler._CRAWLEE_STATE_KEY}_1')
19451952
assert state_1.get('urls') == ['https://c.placeholder.com']
19461953

1947-
with ProcessPoolExecutor() as executor:
1954+
with ProcessPoolExecutor(mp_context=_spawn_context) as executor:
19481955
# Run 2 crawlers, each crawling 1 request, and automatically persist the state.
19491956
second_run_states = executor.submit(
19501957
_process_run_crawlers,

0 commit comments

Comments
 (0)