44import asyncio
55import json
66import logging
7+ import multiprocessing
78import os
89import re
910import sys
@@ -1868,6 +1869,12 @@ def _process_run_crawlers(crawler_inputs: list[_CrawlerInput], storage_dir: str)
18681869 ]
18691870
18701871
1872+ # Use the "spawn" start method to avoid inheriting the parent's tokio runtime thread state
1873+ # (created by pyo3-async-runtimes) via "fork", which causes the child to hang on exit on Linux.
1874+ # See pyo3-async-runtimes#40 / #64.
1875+ _spawn_context = multiprocessing .get_context ('spawn' )
1876+
1877+
18711878async def test_crawler_state_persistence (tmp_path : Path ) -> None :
18721879 """Test that crawler statistics and state persist and are loaded correctly.
18731880
@@ -1877,7 +1884,7 @@ async def test_crawler_state_persistence(tmp_path: Path) -> None:
18771884 storage_client = FileSystemStorageClient (), configuration = Configuration (storage_dir = str (tmp_path ))
18781885 )
18791886
1880- with ProcessPoolExecutor () as executor :
1887+ with ProcessPoolExecutor (mp_context = _spawn_context ) as executor :
18811888 # Crawl 2 requests in the first run and automatically persist the state.
18821889 first_run_state = executor .submit (
18831890 _process_run_crawlers ,
@@ -1890,7 +1897,7 @@ async def test_crawler_state_persistence(tmp_path: Path) -> None:
18901897 assert state .get ('urls' ) == ['https://a.placeholder.com' , 'https://b.placeholder.com' ]
18911898
18921899 # Do not reuse the executor to simulate a fresh process to avoid modified class attributes.
1893- with ProcessPoolExecutor () as executor :
1900+ with ProcessPoolExecutor (mp_context = _spawn_context ) as executor :
18941901 # Crawl 1 additional request in the second run, but use the previously automatically persisted state.
18951902 second_run_state = executor .submit (
18961903 _process_run_crawlers ,
@@ -1926,7 +1933,7 @@ async def test_crawler_state_persistence_2_crawlers_with_migration(tmp_path: Pat
19261933 storage_client = FileSystemStorageClient (), configuration = Configuration (storage_dir = str (tmp_path ))
19271934 )
19281935
1929- with ProcessPoolExecutor () as executor :
1936+ with ProcessPoolExecutor (mp_context = _spawn_context ) as executor :
19301937 # Run 2 crawlers, each crawling 1 request, and automatically persist the state.
19311938 first_run_states = executor .submit (
19321939 _process_run_crawlers ,
@@ -1944,7 +1951,7 @@ async def test_crawler_state_persistence_2_crawlers_with_migration(tmp_path: Pat
19441951 state_1 = await state_kvs .get_value (f'{ BasicCrawler ._CRAWLEE_STATE_KEY } _1' )
19451952 assert state_1 .get ('urls' ) == ['https://c.placeholder.com' ]
19461953
1947- with ProcessPoolExecutor () as executor :
1954+ with ProcessPoolExecutor (mp_context = _spawn_context ) as executor :
19481955 # Run 2 crawlers, each crawling 1 request, and automatically persist the state.
19491956 second_run_states = executor .submit (
19501957 _process_run_crawlers ,
0 commit comments