@@ -980,163 +980,6 @@ async fn shutdown_worker_not_retried() {
980980 assert_eq ! ( shutdown_call_count. load( Ordering :: Relaxed ) , 1 ) ;
981981}
982982
983- /// Reproduces the server-side race in temporalio/temporal#9545 where a poll completes naturally
984- /// (e.g. long-poll timeout) right as shutdown begins, causing the poller to re-poll AFTER the
985- /// ShutdownWorker RPC has already been processed. The server never flushes this second poll,
986- /// so without the 5s TEMP_FIX timeout the worker would hang for 60s.
987- ///
988- /// Sequence:
989- /// 1. Worker starts polling with graceful_poll_shutdown enabled
990- /// 2. initiate_shutdown() fires → shutdown token cancelled, ShutdownWorker RPC spawned
991- /// 3. First poll returns empty (natural timeout) — but this races with ShutdownWorker
992- /// 4. Poller re-polls (graceful mode: no select! against shutdown token)
993- /// 5. ShutdownWorker RPC completes — server flushes nothing (first poll already returned)
994- /// 6. Second poll hangs forever — server doesn't know about it
995- /// 7. TEMP_FIX: after 5s the graceful_interruptor cancels the hanging poll
996- ///
997- /// This test verifies shutdown completes within 10s (not 60s), proving the temp fix works.
998- /// Remove this test when temporalio/temporal#9545 is fully deployed.
999- #[ tokio:: test]
1000- async fn graceful_shutdown_race_temp_fix_prevents_60s_hang ( ) {
1001- use prost:: Message ;
1002- use std:: sync:: atomic:: AtomicUsize ;
1003- use temporalio_common:: protos:: temporal:: api:: {
1004- namespace:: v1:: { NamespaceInfo , namespace_info:: Capabilities } ,
1005- workflowservice:: v1:: DescribeNamespaceResponse ,
1006- } ;
1007- use tokio:: sync:: Notify ;
1008-
1009- fn grpc_ok_empty ( ) -> tonic:: codegen:: http:: Response < tonic:: body:: Body > {
1010- tonic:: codegen:: http:: Response :: builder ( )
1011- . header ( "content-type" , "application/grpc" )
1012- . header ( "grpc-status" , "0" )
1013- . body ( tonic:: body:: Body :: empty ( ) )
1014- . unwrap ( )
1015- }
1016- fn grpc_ok_proto ( msg : & impl Message ) -> tonic:: codegen:: http:: Response < tonic:: body:: Body > {
1017- let encoded = msg. encode_to_vec ( ) ;
1018- let mut buf = Vec :: with_capacity ( 5 + encoded. len ( ) ) ;
1019- buf. push ( 0 ) ;
1020- buf. extend_from_slice ( & ( encoded. len ( ) as u32 ) . to_be_bytes ( ) ) ;
1021- buf. extend_from_slice ( & encoded) ;
1022- tonic:: codegen:: http:: Response :: builder ( )
1023- . header ( "content-type" , "application/grpc" )
1024- . header ( "grpc-status" , "0" )
1025- . body ( tonic:: body:: Body :: new ( http_body_util:: Full :: new (
1026- bytes:: Bytes :: from ( buf) ,
1027- ) ) )
1028- . unwrap ( )
1029- }
1030-
1031- // Track poll count to distinguish first poll (returns empty) from re-polls (hang forever)
1032- let poll_count = Arc :: new ( AtomicUsize :: new ( 0 ) ) ;
1033- let poll_count_clone = poll_count. clone ( ) ;
1034- // Signal from initiate_shutdown (via shutdown_worker RPC) to release the first poll
1035- let shutdown_signal = Arc :: new ( Notify :: new ( ) ) ;
1036- let shutdown_signal_for_rpc = shutdown_signal. clone ( ) ;
1037- let shutdown_signal_for_poll = shutdown_signal. clone ( ) ;
1038-
1039- let fs = fake_server ( move |req| {
1040- let uri = req. uri ( ) . to_string ( ) ;
1041- let poll_count = poll_count_clone. clone ( ) ;
1042- let shutdown_signal_for_poll = shutdown_signal_for_poll. clone ( ) ;
1043- let shutdown_signal_for_rpc = shutdown_signal_for_rpc. clone ( ) ;
1044-
1045- if uri. contains ( "DescribeNamespace" ) {
1046- let resp = DescribeNamespaceResponse {
1047- namespace_info : Some ( NamespaceInfo {
1048- capabilities : Some ( Capabilities {
1049- worker_poll_complete_on_shutdown : true ,
1050- ..Capabilities :: default ( )
1051- } ) ,
1052- ..NamespaceInfo :: default ( )
1053- } ) ,
1054- ..DescribeNamespaceResponse :: default ( )
1055- } ;
1056- async move { grpc_ok_proto ( & resp) } . boxed ( )
1057- } else if uri. contains ( "Poll" ) {
1058- async move {
1059- let n = poll_count. fetch_add ( 1 , Ordering :: SeqCst ) ;
1060- if n == 0 {
1061- // First poll: wait for shutdown to start, then return empty.
1062- // This simulates the poll timing out naturally right as shutdown begins.
1063- shutdown_signal_for_poll. notified ( ) . await ;
1064- grpc_ok_empty ( )
1065- } else {
1066- // Re-poll after shutdown: hang forever. This is the race —
1067- // the server already processed ShutdownWorker and won't flush this poll.
1068- futures_util:: future:: pending ( ) . await
1069- }
1070- }
1071- . boxed ( )
1072- } else if uri. contains ( "ShutdownWorker" ) {
1073- // ShutdownWorker arrives — signal the first poll to return (simulating the race
1074- // where poll returns right as/after ShutdownWorker is processed).
1075- async move {
1076- shutdown_signal_for_rpc. notify_waiters ( ) ;
1077- grpc_ok_empty ( )
1078- }
1079- . boxed ( )
1080- } else {
1081- async { grpc_ok_empty ( ) } . boxed ( )
1082- }
1083- } )
1084- . await ;
1085-
1086- let mut opts = get_integ_server_options ( ) ;
1087- opts. target = format ! ( "http://localhost:{}" , fs. addr. port( ) )
1088- . parse :: < url:: Url > ( )
1089- . unwrap ( ) ;
1090- opts. set_skip_get_system_info ( true ) ;
1091- let connection = Connection :: connect ( opts) . await . unwrap ( ) ;
1092- let client_opts = temporalio_client:: ClientOptions :: new ( "ns" ) . build ( ) ;
1093- let client = temporalio_client:: Client :: new ( connection, client_opts) . unwrap ( ) ;
1094-
1095- let wf_type = "graceful_shutdown_race" ;
1096- let mut starter = CoreWfStarter :: new_with_overrides ( wf_type, None , Some ( client) ) ;
1097- let worker = starter. get_worker ( ) . await ;
1098-
1099- // Enable graceful poll shutdown via validate()
1100- worker. validate ( ) . await . unwrap ( ) ;
1101-
1102- // Start polling BEFORE initiating shutdown so the poll is in-flight.
1103- // poll_workflow_activation triggers LongPollBuffer to start, which spawns a poll task
1104- // that hits the fake server and blocks on the first poll (waiting for shutdown_signal).
1105- let poll_handle = tokio:: spawn ( {
1106- let w = worker. clone ( ) ;
1107- async move {
1108- // This will block until shutdown causes PollError::ShutDown
1109- let _ = w. poll_workflow_activation ( ) . await ;
1110- }
1111- } ) ;
1112- let act_handle = tokio:: spawn ( {
1113- let w = worker. clone ( ) ;
1114- async move {
1115- let _ = w. poll_activity_task ( ) . await ;
1116- }
1117- } ) ;
1118-
1119- // Give polls time to reach the fake server before initiating shutdown
1120- tokio:: time:: sleep ( Duration :: from_millis ( 500 ) ) . await ;
1121-
1122- // Shutdown should complete within 10s (5s TEMP_FIX + margin).
1123- // Without the temp fix, the re-poll hangs for 60s.
1124- let result = tokio:: time:: timeout ( Duration :: from_secs ( 10 ) , async {
1125- worker. shutdown ( ) . await ;
1126- } )
1127- . await ;
1128-
1129- assert ! (
1130- result. is_ok( ) ,
1131- "Shutdown should complete within 10s. If it hangs, the TEMP_FIX graceful poll \
1132- timeout is not working and the server race (temporal#9545) caused a 60s hang."
1133- ) ;
1134-
1135- let _ = poll_handle. await ;
1136- let _ = act_handle. await ;
1137- fs. shutdown ( ) . await ;
1138- }
1139-
1140983#[ test]
1141984fn test_default_build_id ( ) {
1142985 let o = WorkerOptions :: new ( "task_queue" ) . build ( ) ;
@@ -1153,7 +996,7 @@ impl ShutdownTimerActivityLoopWf {
1153996 #[ run]
1154997 async fn run ( ctx : & mut WorkflowContext < Self > ) -> WorkflowResult < ( ) > {
1155998 loop {
1156- ctx. timer ( Duration :: from_millis ( 100 ) ) . await ;
999+ ctx. timer ( Duration :: from_millis ( 10 ) ) . await ;
11571000 ctx. start_activity (
11581001 StdActivities :: no_op,
11591002 ( ) ,
0 commit comments