Skip to content

WIP kubeapiserver auditloganalyzer: spot handler panics in audit log #29127

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
package auditloganalyzer

import (
"fmt"
"sort"
"strings"
"sync"
"time"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/sets"
auditv1 "k8s.io/apiserver/pkg/apis/audit/v1"
)

type panicEvent struct {
auditID types.UID
timestamp time.Time
}

func (p panicEvent) String() string {
return fmt.Sprintf("auditID %s at %s", p.auditID, p.timestamp.String())
}

type PanicEventByTimestamp []panicEvent

func (n PanicEventByTimestamp) Len() int {
return len(n)
}
func (n PanicEventByTimestamp) Swap(i, j int) {
n[i], n[j] = n[j], n[i]
}
func (n PanicEventByTimestamp) Less(i, j int) bool {
diff := n[i].timestamp.Compare(n[j].timestamp)
switch {
case diff < 0:
return true
case diff > 0:
return false
}

return strings.Compare(string(n[i].auditID), string(n[j].auditID)) < 0
}

type panicEventsForEndpoint struct {
panicEvents map[string]sets.Set[panicEvent]
}

func NewPanicEventsForEndpoint() panicEventsForEndpoint {
return panicEventsForEndpoint{
panicEvents: make(map[string]sets.Set[panicEvent]),
}
}

func (p panicEventsForEndpoint) Insert(endpoint string, pe panicEvent) {
events, ok := p.panicEvents[endpoint]
if !ok {
events = sets.New[panicEvent]()
}
events.Insert(pe)
p.panicEvents[endpoint] = events
}

func (p panicEventsForEndpoint) String() string {
result := ""
for endpoint, events := range p.panicEvents {
sortedEvents := events.UnsortedList()
sort.Sort(PanicEventByTimestamp(sortedEvents))
eventsAsStrings := []string{}
for _, event := range sortedEvents {
eventsAsStrings = append(eventsAsStrings, event.String())
}
eventString := fmt.Sprintf(" %s", strings.Join(eventsAsStrings, "\n "))
result = fmt.Sprintf("%s\nFound %d panics for endpoint %q:\n%s", result, len(events), endpoint, eventString)
}
return result
}

func (p panicEventsForEndpoint) Len() int {
sum := 0
for _, endpoints := range p.panicEvents {
sum += endpoints.Len()
}
return sum
}

type panicEventsForUserAgent struct {
panicEvents map[string]panicEventsForEndpoint
}

func NewPanicEventsForUserAgent() panicEventsForUserAgent {
return panicEventsForUserAgent{
panicEvents: make(map[string]panicEventsForEndpoint),
}
}

func (p panicEventsForUserAgent) Insert(useragent string, endpoint string, pe panicEvent) {
events, ok := p.panicEvents[useragent]
if !ok {
events = NewPanicEventsForEndpoint()
}
events.Insert(endpoint, pe)
p.panicEvents[useragent] = events
}

type apiserverPaniced struct {
lock sync.Mutex
panicEventsPerUserAgent panicEventsForUserAgent
}

func CheckForApiserverPaniced() *apiserverPaniced {
return &apiserverPaniced{
panicEventsPerUserAgent: NewPanicEventsForUserAgent(),
}
}

func (s *apiserverPaniced) HandleAuditLogEvent(auditEvent *auditv1.Event, beginning, end *metav1.MicroTime) {
if beginning != nil && auditEvent.RequestReceivedTimestamp.Before(beginning) || end != nil && end.Before(&auditEvent.RequestReceivedTimestamp) {
return
}

if auditEvent.ResponseStatus == nil {
return
}
if auditEvent.ResponseStatus.Code != 500 {
return
}

s.lock.Lock()
defer s.lock.Unlock()

pe := panicEvent{
auditID: auditEvent.AuditID,
timestamp: auditEvent.RequestReceivedTimestamp.Time,
}
s.panicEventsPerUserAgent.Insert(auditEvent.UserAgent, auditEvent.RequestURI, pe)
}
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
package auditloganalyzer

import (
"strings"
"sync"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
auditv1 "k8s.io/apiserver/pkg/apis/audit/v1"
"k8s.io/apiserver/pkg/authentication/serviceaccount"
"strings"
"sync"
)

type excessiveApplies struct {
Expand Down
34 changes: 34 additions & 0 deletions pkg/monitortests/kubeapiserver/auditloganalyzer/monitortest.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ type auditLogAnalyzer struct {
invalidRequestsChecker *invalidRequests
requestsDuringShutdownChecker *lateRequestTracking
violationChecker *auditViolations
apiserverPanicChecker *apiserverPaniced

countsForInstall *CountsForRun
}
Expand All @@ -42,6 +43,7 @@ func NewAuditLogAnalyzer() monitortestframework.MonitorTest {
invalidRequestsChecker: CheckForInvalidMutations(),
requestsDuringShutdownChecker: CheckForRequestsDuringShutdown(),
violationChecker: CheckForViolations(),
apiserverPanicChecker: CheckForApiserverPaniced(),
}
}

Expand Down Expand Up @@ -85,6 +87,7 @@ func (w *auditLogAnalyzer) CollectData(ctx context.Context, storageDir string, b
w.invalidRequestsChecker,
w.requestsDuringShutdownChecker,
w.violationChecker,
w.apiserverPanicChecker,
}
if w.requestCountTracking != nil {
auditLogHandlers = append(auditLogHandlers, w.requestCountTracking)
Expand Down Expand Up @@ -367,6 +370,37 @@ func (w *auditLogAnalyzer) EvaluateTestsFromConstructedIntervals(ctx context.Con

ret = append(ret, w.violationChecker.CreateJunits()...)

for userAgent, userAgentPanics := range w.apiserverPanicChecker.panicEventsPerUserAgent.panicEvents {
testName := fmt.Sprintf("user %s must not produce too many apiserver handler panics", userAgent)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is generating random test names, infinitely increasing the tests in sippy even as a presubmit. 150 tests so far. We'll clean those up but this needs to be a megatest, can't do a test per uncontrolled user inputs.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ex name | user ip-10-0-21-189/ovnkube@038aea1608e0 (linux/amd64) kubernetes/v0.31.1 must not produce too many apiserver handler panics

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, this may not have an authenticated user.
Should we use "too many apiserver handler panics produced" as testname and move useragent to the error message?
Perhaps, instead we could add username if available and use "unknown" otherwise? The usernames may be infinite too though

Copy link
Contributor

@dgoodwin dgoodwin Jan 24, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It does seem dangerous even if just authed users, presumably tests could be making some with random names? Typically we would write a whitelist for known users you're particularly interested in broken into their own tests, plus a fallback catch all single test, or just the fallback single test.


failures := []string{}
if userAgentPanics.Len() > 5 {
failures := append(failures, userAgentPanics.String())
ret = append(ret,
&junitapi.JUnitTestCase{
Name: testName,
FailureOutput: &junitapi.FailureOutput{
Message: strings.Join(failures, "\n"),
Output: "details in audit log",
},
},
)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add a success here so that we flake instead of fail to start. We can still search for flakes.

// Add success again to make it a flake
ret = append(ret,
&junitapi.JUnitTestCase{
Name: testName,
},
)
} else {
ret = append(ret,
&junitapi.JUnitTestCase{
Name: testName,
},
)
}

}

return ret, nil
}

Expand Down