Skip to content

Commit 96a2136

Browse files
committed
OCPBUGS-62269 - Fix race condition causing missing audit log entries for rapid commands
OCPBUGS-62269 - Fix GitHub Actions linting issues
1 parent d0a182b commit 96a2136

File tree

1 file changed

+81
-2
lines changed

1 file changed

+81
-2
lines changed

internal/pkg/daemon/enricher/jsonenricher.go

Lines changed: 81 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -204,7 +204,7 @@ func (e *JsonEnricher) Run(ctx context.Context, runErr chan<- error) {
204204

205205
e.logger.V(config.VerboseLevel).Info("Emit audit log for process",
206206
"pid", logItem.Key())
207-
e.dispatchSeccompLine(auditLogBucket, nodeName)
207+
e.dispatchSeccompLine(ctx, auditLogBucket, logItem.Key(), nodeName)
208208
})
209209

210210
e.logger.Info(fmt.Sprintf("Setting up caches with expiry of %v", defaultCacheTimeout))
@@ -411,8 +411,80 @@ func (e *JsonEnricher) fetchProcessInfo(processId int, executable string, uid, g
411411
return processInfo
412412
}
413413

414+
// retryBpfCmdLine retries fetching cmdLine from BPF cache with progressive delays.
415+
// This handles the race condition where audit events arrive before BPF events are processed.
416+
func (e *JsonEnricher) retryBpfCmdLine(logBucket *types.LogBucket) {
417+
if e.bpfProcessCache == nil || logBucket.ProcessInfo.CmdLine != "" || logBucket.ProcessInfo.Pid == 0 {
418+
return
419+
}
420+
421+
maxRetries := 3
422+
retryDelays := []time.Duration{0, 10 * time.Millisecond, 50 * time.Millisecond}
423+
424+
for attempt := 0; attempt < maxRetries && logBucket.ProcessInfo.CmdLine == ""; attempt++ {
425+
if attempt > 0 {
426+
time.Sleep(retryDelays[attempt])
427+
}
428+
429+
cmdLine, errCmdLine := e.bpfProcessCache.GetCmdLine(logBucket.ProcessInfo.Pid)
430+
if errCmdLine == nil {
431+
logBucket.ProcessInfo.CmdLine = cmdLine
432+
e.logger.V(1).Info("cmdline found in eBPF on retry",
433+
"processId", logBucket.ProcessInfo.Pid, "cmdLine", cmdLine, "attempt", attempt+1)
434+
435+
break
436+
}
437+
}
438+
}
439+
440+
// retryBpfRequestUID retries fetching requestUID from BPF environment with progressive delays.
441+
func (e *JsonEnricher) retryBpfRequestUID(logBucket *types.LogBucket) {
442+
if e.bpfProcessCache == nil || logBucket.ProcessInfo.ExecRequestId != nil {
443+
return
444+
}
445+
446+
maxRetries := 3
447+
retryDelays := []time.Duration{0, 10 * time.Millisecond, 50 * time.Millisecond}
448+
449+
for attempt := 0; attempt < maxRetries && logBucket.ProcessInfo.ExecRequestId == nil; attempt++ {
450+
if attempt > 0 {
451+
time.Sleep(retryDelays[attempt])
452+
}
453+
454+
procEnv, errEnv := e.bpfProcessCache.GetEnv(logBucket.ProcessInfo.Pid)
455+
if errEnv == nil {
456+
reqId, ok := procEnv[requestIdEnv]
457+
if ok {
458+
logBucket.ProcessInfo.ExecRequestId = &reqId
459+
e.logger.V(1).Info("exec request id found in eBPF on retry", "reqId", reqId,
460+
"processId", logBucket.ProcessInfo.Pid, "attempt", attempt+1)
461+
462+
break
463+
}
464+
}
465+
}
466+
}
467+
468+
// retryContainerInfo retries fetching container info if it's missing.
469+
func (e *JsonEnricher) retryContainerInfo(ctx context.Context, logBucket *types.LogBucket, processID int, nodeName string) {
470+
if logBucket.ContainerInfo != nil {
471+
return
472+
}
473+
474+
e.logger.V(config.VerboseLevel).Info("Container info not found, retrying",
475+
"processID", processID)
476+
477+
logBucket.ContainerInfo = e.fetchContainerInfo(ctx, processID, nodeName)
478+
if logBucket.ContainerInfo != nil {
479+
e.logger.V(1).Info("Container info found on retry",
480+
"processID", processID,
481+
"pod", logBucket.ContainerInfo.PodName,
482+
"namespace", logBucket.ContainerInfo.Namespace)
483+
}
484+
}
485+
414486
func (e *JsonEnricher) dispatchSeccompLine(
415-
logBucket *types.LogBucket, nodeName string,
487+
ctx context.Context, logBucket *types.LogBucket, processID int, nodeName string,
416488
) {
417489
var syscallNames []string
418490

@@ -443,6 +515,13 @@ func (e *JsonEnricher) dispatchSeccompLine(
443515
return
444516
}
445517

518+
// Retry BPF cache lookups with progressive delays to allow ring buffer polling to catch up
519+
e.retryBpfCmdLine(logBucket)
520+
e.retryBpfRequestUID(logBucket)
521+
522+
// Retry container info lookup if it's still missing
523+
e.retryContainerInfo(ctx, logBucket, processID, nodeName)
524+
446525
if logBucket.ContainerInfo == nil {
447526
e.logger.V(config.VerboseLevel).Info("Container info not found in cache")
448527
}

0 commit comments

Comments
 (0)