11package os
22
33import (
4- "context"
54 "fmt"
65 "io"
76 "math"
@@ -118,15 +117,15 @@ func (e *execer) Exec(command scootexecer.Command) (scootexecer.Process, error)
118117
119118 proc := & process {cmd : cmd , wg : & wg , ats : AbortTimeoutSec , LogTags : command .LogTags }
120119 if e .memCap > 0 {
121- go e .monitorMem (proc , command .MemCh )
120+ go e .monitorMem (proc , command .MemCh , command . Stderr )
122121 }
123122
124123 return proc , nil
125124}
126125
127126// Periodically check to make sure memory constraints are respected,
128127// and clean up after ourselves when the process has completed
129- func (e * execer ) monitorMem (p * process , memCh chan scootexecer.ProcessStatus ) {
128+ func (e * execer ) monitorMem (p * process , memCh chan scootexecer.ProcessStatus , stderr io. Writer ) {
130129 pid := p .cmd .Process .Pid
131130 pgid , err := syscall .Getpgid (pid )
132131 if err != nil {
@@ -168,6 +167,8 @@ func (e *execer) monitorMem(p *process, memCh chan scootexecer.ProcessStatus) {
168167 Error : msg ,
169168 ExitCode : errors .HighInitialMemoryUtilizationExitCode ,
170169 }
170+ // log the process snapshot in worker log, as well as task stderr log
171+ e .pw .LogProcs (p , log .ErrorLevel , stderr )
171172 p .mutex .Unlock ()
172173 e .memCapKill (p , mem , memCh )
173174 return
@@ -212,6 +213,7 @@ func (e *execer) monitorMem(p *process, memCh chan scootexecer.ProcessStatus) {
212213 Error : msg ,
213214 ExitCode : 1 ,
214215 }
216+ e .pw .LogProcs (p , log .ErrorLevel , stderr )
215217 p .mutex .Unlock ()
216218 e .memCapKill (p , mem , memCh )
217219 return
@@ -230,7 +232,7 @@ func (e *execer) monitorMem(p *process, memCh chan scootexecer.ProcessStatus) {
230232 "jobID" : p .JobID ,
231233 "taskID" : p .TaskID ,
232234 }).Infof ("Memory utilization increased to %d%%, pid: %d" , int (memUsagePct * 100 ), pid )
233- debugProcesses ( p )
235+ e . pw . LogProcs ( p , log . DebugLevel , nil )
234236 for memUsagePct > reportThresholds [thresholdsIdx ] {
235237 thresholdsIdx ++
236238 }
@@ -265,26 +267,6 @@ func (e *execer) memCapKill(p *process, mem scootexecer.Memory, memCh chan scoot
265267 e .stat .Gauge (stats .WorkerMemory ).Update (int64 (postKillMem ))
266268}
267269
268- // debugProcesses logs a "ps" snapshot of the processes owned by $USER at debug level; it does nothing unless debug logging is enabled.
269- func debugProcesses (p * process ) {
270- // Run ps under a 2-second context timeout, since it seems CombinedOutput() sometimes fails to return.
271- if log .IsLevelEnabled (log .DebugLevel ) {
272- ctx , cancel := context .WithTimeout (context .Background (), 2 * time .Second )
273- ps , err := exec .CommandContext (ctx , "ps" , "-u" , os .Getenv ("USER" ), "-opid,sess,ppid,pgid,rss,args" ).CombinedOutput ()
274- log .WithFields (
275- log.Fields {
276- "pid" : p .cmd .Process .Pid ,
277- "ps" : string (ps ),
278- "err" : err ,
279- "errCtx" : ctx .Err (), // non-nil once the context is done, e.g. the 2s deadline has passed
280- "tag" : p .Tag ,
281- "jobID" : p .JobID ,
282- "taskID" : p .TaskID ,
283- }).Debugf ("ps after increased memory utilization for pid %d" , p .cmd .Process .Pid )
284- cancel () // release the timeout context's resources
285- }
286- }
287-
288270// Kill process along with all child processes, assuming no child processes called setpgid
289271func cleanupProcs (pgid int ) (err error ) {
290272 log .WithFields (
0 commit comments