@@ -13,12 +13,15 @@ use std::fmt::Debug;
1313use std:: io:: IsTerminal ;
1414use std:: process:: ExitStatus ;
1515use std:: process:: Stdio ;
16+ use std:: time:: Duration ;
1617use std:: time:: Instant ;
1718use tokio:: io:: DuplexStream ;
1819use tokio:: sync:: oneshot;
1920use tokio:: task:: JoinHandle ;
2021
2122use aho_corasick:: AhoCorasick ;
23+ use backoff:: backoff:: Backoff ;
24+ use backoff:: ExponentialBackoff ;
2225use camino:: Utf8Path ;
2326use camino:: Utf8PathBuf ;
2427use miette:: miette;
@@ -268,6 +271,8 @@ pub struct Ghci {
268271 search_paths : ShowPaths ,
269272 /// Tasks running `async:` shell commands in the background.
270273 command_handles : Vec < JoinHandle < miette:: Result < ExitStatus > > > ,
274+ /// Monotonic counter for generating unique sync barrier nonces.
275+ sync_nonce : u64 ,
271276}
272277
273278impl Debug for Ghci {
@@ -398,6 +403,7 @@ impl Ghci {
398403 search_paths : Default :: default ( ) ,
399404 } ,
400405 command_handles,
406+ sync_nonce : 0 ,
401407 } )
402408 }
403409
@@ -837,20 +843,141 @@ impl Ghci {
837843 Ok ( ( ) )
838844 }
839845
846+ /// Interrupt the running GHCi session.
847+ ///
848+ /// On `Err`, the GHCi session may have been killed (e.g. because the sync
849+ /// barrier could not restore the prompt). Callers MUST treat an error here
850+ /// as a session-died event and route through the normal restart path
851+ /// rather than propagating it as fatal. See [`Ghci::sync_barrier`] for details.
840852 #[ instrument( skip_all, level = "debug" ) ]
841853 async fn send_sigint ( & mut self ) -> miette:: Result < ( ) > {
842854 let start_instant = Instant :: now ( ) ;
843- signal:: killpg ( self . process_group_id , Signal :: SIGINT )
844- . into_diagnostic ( )
845- . wrap_err ( "Failed to send `Ctrl-C` (`SIGINT`) to ghci session" ) ?;
846- self . stdout
847- . prompt (
848- crate :: incremental_reader:: FindAt :: Anywhere ,
849- // Ignore compilation messages.
850- & mut Default :: default ( ) ,
851- )
852- . await ?;
853- tracing:: debug!( "Interrupted ghci in {:.2?}" , start_instant. elapsed( ) ) ;
855+
856+ // Phase 1: Send SIGINT repeatedly until we find a clean, uninterrupted prompt.
857+ //
858+ // An interrupted reload can cause interleaved output between the GHCi prompt and
859+ // compilation output (due to GHC bug where the logging thread isn't stopped on
860+ // async exception — see `runParPipelines` in GHC's Driver/Make.hs). We send
861+ // SIGINT with exponential backoff until we see a prompt that isn't garbled.
862+ let mut backoff = ExponentialBackoff {
863+ initial_interval : Duration :: from_millis ( 5 ) ,
864+ max_interval : Duration :: from_millis ( 100 ) ,
865+ multiplier : 1.25 ,
866+ max_elapsed_time : Some ( Duration :: from_secs ( 10 ) ) ,
867+ ..Default :: default ( )
868+ } ;
869+
870+ let mut sigint_count: usize = 0 ;
871+ loop {
872+ let Some ( delay) = backoff. next_backoff ( ) else {
873+ return Err ( miette ! (
874+ "Timed out waiting for GHCi to respond to SIGINT after {:.2?}" ,
875+ start_instant. elapsed( )
876+ ) ) ;
877+ } ;
878+
879+ sigint_count += 1 ;
880+ signal:: killpg ( self . process_group_id , Signal :: SIGINT )
881+ . into_diagnostic ( )
882+ . wrap_err ( "Failed to send `Ctrl-C` (`SIGINT`) to ghci session" ) ?;
883+ tracing:: debug!( count = sigint_count, "Sent SIGINT" ) ;
884+
885+ let found = self . stdout . buffer_and_drain_prompts ( delay) . await ?;
886+ if found > 0 {
887+ tracing:: debug!(
888+ found,
889+ elapsed = ?start_instant. elapsed( ) ,
890+ "Found prompt after SIGINT"
891+ ) ;
892+ break ;
893+ }
894+ }
895+
896+ // If we only sent 1 SIGINT, then there cannot be extra prompts waiting to be read from the
897+ // buffer; only do the sync barrier process if we sent multiple SIGINTs.
898+ if sigint_count > 1 {
899+ self . sync_barrier ( ) . await ?;
900+ }
901+
902+ tracing:: info!( "Interrupted ghci in {:.2?}" , start_instant. elapsed( ) ) ;
903+ Ok ( ( ) )
904+ }
905+
906+ /// Sync barrier: deterministically consume all stale prompts from the pipe.
907+ ///
908+ /// We rely on the fact that GHCi processes input commands one at a time, in order. When we send
909+ /// a command to GHCi, we read its output up until the next prompt and know that the output
910+ /// we've read matches the command we sent. This is important because we parse GHCi output in
911+ /// several places (e.g. compilation errors go to the `error_log`, `:show paths` and `:show
912+ /// targets` are used to inform module additions/removals/reloads, etc.), so if we're parsing
913+ /// output from a different command, we'll Have Problems.
914+ ///
915+ /// When we're hitting Ctrl-C repeatedly (in case of a user input prompt interleaved with
916+ /// compilation output in GHCi's stdout stream), we don't know how many times GHCi will print a
917+ /// prompt that we can read.
918+ ///
919+ /// Therefore, we _change_ the prompt and read until _that_ specific prompt shows up in the
920+ /// output, using a unique (to the `ghci` process) and different prompt each time we call this
921+ /// method. This ensures we consume all remaining stale output, without having to wait until we
922+ /// "think it's safe" and wasting the user's time after GHCi is done writing.
923+ #[ instrument( skip_all, level = "debug" ) ]
924+ async fn sync_barrier ( & mut self ) -> miette:: Result < ( ) > {
925+ self . sync_nonce += 1 ;
926+ let nonce = self . sync_nonce ;
927+ let sync_marker = format ! ( "~~~GHCIWATCH-SYNC-{nonce}~~~" ) ;
928+
929+ // Set the prompt to our sync marker.
930+ self . stdin
931+ . write_set_prompt ( & sync_marker)
932+ . await
933+ . wrap_err ( "Failed to write sync command to ghci stdin" ) ?;
934+
935+ // From here until the prompt is restored, any failure leaves the session
936+ // unable to match `PROMPT` again. Restoring in-band after a failed read
937+ // is not safe (the buffer is in an unknown state, and confirming the
938+ // restore would itself depend on prompt matching), so on any error we
939+ // SIGKILL the process and let the manager restart the session.
940+ let sync_timeout = Duration :: from_secs ( 3 ) ;
941+ let read =
942+ tokio:: time:: timeout ( sync_timeout, self . stdout . read_until_marker ( & sync_marker) ) . await ;
943+ let result = match read {
944+ Ok ( Ok ( _ghci_output) ) => self
945+ . stdin
946+ . set_prompt (
947+ & mut self . stdout ,
948+ PROMPT ,
949+ crate :: incremental_reader:: FindAt :: LineStart ,
950+ // We don't expect to see any compilation here, so we pass a stub
951+ // `CompilationLog` and discard it.
952+ & mut Default :: default ( ) ,
953+ )
954+ . await
955+ . wrap_err ( "Failed to restore prompt after sync barrier" ) ,
956+ Ok ( Err ( e) ) => Err ( e) . wrap_err ( "Failed to read until sync marker" ) ,
957+ Err ( _elapsed) => Err ( miette ! (
958+ "Timed out waiting for GHCi sync marker after {sync_timeout:?}"
959+ ) ) ,
960+ } ;
961+
962+ if let Err ( e) = result {
963+ // Kill the process directly rather than going through `restart_sender`.
964+ // `restart_sender` is the graceful-shutdown path: `GhciProcess` consumes it
965+ // and intentionally suppresses `exited_sender`, so the manager would never
966+ // learn ghci died. We need the wait future in `GhciProcess::run` to win the
967+ // select so `exited_sender` fires and `wait_and_restart_runtime` takes over.
968+ if let Err ( kill_err) = signal:: killpg ( self . process_group_id , Signal :: SIGKILL )
969+ . into_diagnostic ( )
970+ . wrap_err ( "Failed to send `SIGKILL` to ghci session" )
971+ {
972+ tracing:: error!(
973+ error = %kill_err,
974+ "Failed to SIGKILL ghci after sync_barrier failure" ,
975+ ) ;
976+ }
977+ return Err ( e) . wrap_err (
978+ "ghci sync barrier failed; killed the session because the prompt could not be restored" ,
979+ ) ;
980+ }
854981 Ok ( ( ) )
855982 }
856983
0 commit comments