-
Notifications
You must be signed in to change notification settings - Fork 304
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Handle processes whose main thread has exited #376
base: main
Are you sure you want to change the base?
Changes from 8 commits
dfaf278
2835340
eb4e909
068322f
b97ae39
0da64dc
01392e6
48698d5
5905d6a
1eeb764
78b9208
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,12 +24,17 @@ import ( | |
"go.opentelemetry.io/ebpf-profiler/stringutil" | ||
) | ||
|
||
// GetMappings returns this error when no mappings can be extracted. | ||
var ErrNoMappings = errors.New("no mappings") | ||
|
||
// systemProcess provides an implementation of the Process interface for a | ||
// process that is currently running on this machine. | ||
type systemProcess struct { | ||
pid libpf.PID | ||
tid libpf.PID | ||
|
||
remoteMemory remotememory.RemoteMemory | ||
mainThreadExit bool | ||
remoteMemory remotememory.RemoteMemory | ||
|
||
fileToMapping map[string]*Mapping | ||
} | ||
|
@@ -53,9 +58,10 @@ func init() { | |
} | ||
|
||
// New returns an object with Process interface accessing it | ||
func New(pid libpf.PID) Process { | ||
func New(pid, tid libpf.PID) Process { | ||
return &systemProcess{ | ||
pid: pid, | ||
tid: tid, | ||
remoteMemory: remotememory.NewProcessVirtualMemory(pid), | ||
} | ||
} | ||
|
@@ -165,9 +171,8 @@ func parseMappings(mapsFile io.Reader) ([]Mapping, uint32, error) { | |
path = VdsoPathName | ||
device = 0 | ||
inode = vdsoInode | ||
} else if path != "" { | ||
// Ignore [vsyscall] and similar executable kernel | ||
// pages we don't care about | ||
} else { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No semantic change, I just inlined the logic from |
||
// Ignore mappings that are invalid, non-existent or are special pseudo-files | ||
continue | ||
} | ||
} else { | ||
|
@@ -229,20 +234,45 @@ func (sp *systemProcess) GetMappings() ([]Mapping, uint32, error) { | |
defer mapsFile.Close() | ||
|
||
mappings, numParseErrors, err := parseMappings(mapsFile) | ||
if err == nil { | ||
fileToMapping := make(map[string]*Mapping, len(mappings)) | ||
for idx := range mappings { | ||
m := &mappings[idx] | ||
if m.Inode == 0 { | ||
// Ignore mappings that are invalid, | ||
// non-existent or are special pseudo-files. | ||
continue | ||
} | ||
fileToMapping[m.Path] = m | ||
if err != nil { | ||
return mappings, numParseErrors, err | ||
} | ||
|
||
if len(mappings) == 0 { | ||
// We could test for main thread exit here by checking for zombie state | ||
// in /proc/sp.pid/stat but it's simpler to assume that this is the case | ||
// and try extracting mappings for a different thread. Since we stopped | ||
// processing /proc at agent startup, it's very unlikely that the agent | ||
// will sample a process being initialized without mappings. | ||
log.Warnf("PID: %v main thread exit", sp.pid) | ||
sp.mainThreadExit = true | ||
mapsFileAlt, err := os.Open(fmt.Sprintf("/proc/%d/task/%d/maps", sp.pid, sp.tid)) | ||
// On all errors resulting from trying to get mappings from a different thread, | ||
// return ErrNoMappings which will keep the PID tracked in processmanager and | ||
// allow for a future iteration to try extracting mappings from a different thread. | ||
// This is done to deal with race conditions triggered by thread exits (we do not want | ||
// the agent to unload process metadata when a thread exits but the process is still | ||
// alive). | ||
if err != nil { | ||
return mappings, numParseErrors, ErrNoMappings | ||
} | ||
defer mapsFileAlt.Close() | ||
|
||
numParseErrorsAlt := uint32(0) | ||
mappings, numParseErrorsAlt, err = parseMappings(mapsFileAlt) | ||
numParseErrors += numParseErrorsAlt | ||
if err != nil || len(mappings) == 0 { | ||
return mappings, numParseErrors, ErrNoMappings | ||
} | ||
sp.fileToMapping = fileToMapping | ||
} | ||
return mappings, numParseErrors, err | ||
|
||
fileToMapping := make(map[string]*Mapping, len(mappings)) | ||
for idx := range mappings { | ||
m := &mappings[idx] | ||
fileToMapping[m.Path] = m | ||
} | ||
sp.fileToMapping = fileToMapping | ||
return mappings, numParseErrors, nil | ||
} | ||
|
||
func (sp *systemProcess) GetThreads() ([]ThreadInfo, error) { | ||
|
@@ -271,6 +301,11 @@ func (sp *systemProcess) getMappingFile(m *Mapping) string { | |
if m.IsAnonymous() || m.IsVDSO() { | ||
return "" | ||
} | ||
if sp.mainThreadExit { | ||
// Neither /proc/sp.pid/map_files nor /proc/sp.pid/task/sp.tid/map_files | ||
// exist if main thread has exited, so we use the mapping path directly. | ||
return m.Path | ||
} | ||
return fmt.Sprintf("/proc/%v/map_files/%x-%x", sp.pid, m.Vaddr, m.Vaddr+m.Length) | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -538,7 +538,7 @@ func (pm *ProcessManager) synchronizeMappings(pr process.Process, | |
// fast enough and this particular pid is reused again by the system. | ||
func (pm *ProcessManager) processPIDExit(pid libpf.PID) { | ||
exitKTime := times.GetKTime() | ||
log.Debugf("- PID: %v", pid) | ||
log.Warnf("- PID: %v", pid) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'll remove these newly added warnings before merging, they should help with reviewing the PR as you don't need to run the agent with debug logs enabled and sort through a lot of irrelevant noise. |
||
|
||
var err error | ||
defer func() { | ||
|
@@ -614,9 +614,16 @@ func (pm *ProcessManager) SynchronizeProcess(pr process.Process) { | |
return | ||
} | ||
|
||
if errors.Is(err, process.ErrNoMappings) { | ||
// When no mappings can be extracted but the process is still alive, | ||
// do not trigger a process exit to avoid unloading process metadata. | ||
// As it's likely that a future iteration can extract mappings from a | ||
// different thread in the process, notify eBPF to enable further notifications. | ||
pm.ebpf.RemoveReportedPID(pid) | ||
return | ||
} | ||
|
||
// All other errors imply that the process has exited. | ||
// Clean up, and notify eBPF. | ||
pm.processPIDExit(pid) | ||
if os.IsNotExist(err) { | ||
// Since listing /proc and opening files in there later is inherently racy, | ||
// we expect to lose the race sometimes and thus expect to hit os.IsNotExist. | ||
|
@@ -626,22 +633,7 @@ func (pm *ProcessManager) SynchronizeProcess(pr process.Process) { | |
// return ESRCH. Handle it as if the process did not exist. | ||
pm.mappingStats.errProcESRCH.Add(1) | ||
} | ||
return | ||
} | ||
if len(mappings) == 0 { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These comments are no longer relevant. |
||
// Valid process without any (executable) mappings. All cases are | ||
// handled as process exit. Possible causes and reasoning: | ||
// 1. It is a kernel worker process. The eBPF does not send events from these, | ||
// but we can see kernel threads here during startup when tracer walks | ||
// /proc and tries to synchronize all PIDs it sees. | ||
// The PID should not exist anywhere, but we can still double check and | ||
// make sure the PID is not tracked. | ||
// 2. It is a normal process executing, but we just sampled it when the kernel | ||
// execve() is rebuilding the mappings and nothing is currently mapped. | ||
// In this case we can handle it as process exit because everything about | ||
// the process is changing: all mappings, comm, etc. If execve fails, we | ||
// reaped it early. If execve succeeds, we will get new synchronization | ||
// request soon, and handle it as a new process event. | ||
christos68k marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// Clean up, and notify eBPF. | ||
pm.processPIDExit(pid) | ||
return | ||
} | ||
|
@@ -744,6 +736,7 @@ func (pm *ProcessManager) ProcessedUntil(traceCaptureKTime times.KTime) { | |
continue | ||
} | ||
|
||
log.Warnf("PID %v deleted", pid) | ||
delete(pm.pidToProcessInfo, pid) | ||
|
||
for _, instance := range pm.interpreters[pid] { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I didn't switch
Process
to acceptlibpf.PIDTID
as the latter is only used with PID events, and I'd rather not couple it here too.