-
Notifications
You must be signed in to change notification settings - Fork 4.7k
WIP kubeapiserver auditloganalyzer: spot handler panics in audit log #29127
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
vrutkovs
wants to merge
1
commit into
openshift:main
Choose a base branch
from
vrutkovs:auditloganalyzer-handler-panics
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
137 changes: 137 additions & 0 deletions
137
pkg/monitortests/kubeapiserver/auditloganalyzer/handle_apiserver_paniced.go
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,137 @@ | ||
package auditloganalyzer | ||
|
||
import ( | ||
"fmt" | ||
"sort" | ||
"strings" | ||
"sync" | ||
"time" | ||
|
||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||
"k8s.io/apimachinery/pkg/types" | ||
"k8s.io/apimachinery/pkg/util/sets" | ||
auditv1 "k8s.io/apiserver/pkg/apis/audit/v1" | ||
) | ||
|
||
type panicEvent struct { | ||
auditID types.UID | ||
timestamp time.Time | ||
} | ||
|
||
func (p panicEvent) String() string { | ||
return fmt.Sprintf("auditID %s at %s", p.auditID, p.timestamp.String()) | ||
} | ||
|
||
type PanicEventByTimestamp []panicEvent | ||
|
||
func (n PanicEventByTimestamp) Len() int { | ||
return len(n) | ||
} | ||
func (n PanicEventByTimestamp) Swap(i, j int) { | ||
n[i], n[j] = n[j], n[i] | ||
} | ||
func (n PanicEventByTimestamp) Less(i, j int) bool { | ||
diff := n[i].timestamp.Compare(n[j].timestamp) | ||
switch { | ||
case diff < 0: | ||
return true | ||
case diff > 0: | ||
return false | ||
} | ||
|
||
return strings.Compare(string(n[i].auditID), string(n[j].auditID)) < 0 | ||
} | ||
|
||
type panicEventsForEndpoint struct { | ||
panicEvents map[string]sets.Set[panicEvent] | ||
} | ||
|
||
func NewPanicEventsForEndpoint() panicEventsForEndpoint { | ||
return panicEventsForEndpoint{ | ||
panicEvents: make(map[string]sets.Set[panicEvent]), | ||
} | ||
} | ||
|
||
func (p panicEventsForEndpoint) Insert(endpoint string, pe panicEvent) { | ||
events, ok := p.panicEvents[endpoint] | ||
if !ok { | ||
events = sets.New[panicEvent]() | ||
} | ||
events.Insert(pe) | ||
p.panicEvents[endpoint] = events | ||
} | ||
|
||
func (p panicEventsForEndpoint) String() string { | ||
result := "" | ||
for endpoint, events := range p.panicEvents { | ||
sortedEvents := events.UnsortedList() | ||
sort.Sort(PanicEventByTimestamp(sortedEvents)) | ||
eventsAsStrings := []string{} | ||
for _, event := range sortedEvents { | ||
eventsAsStrings = append(eventsAsStrings, event.String()) | ||
} | ||
eventString := fmt.Sprintf(" %s", strings.Join(eventsAsStrings, "\n ")) | ||
result = fmt.Sprintf("%s\nFound %d panics for endpoint %q:\n%s", result, len(events), endpoint, eventString) | ||
} | ||
return result | ||
} | ||
|
||
func (p panicEventsForEndpoint) Len() int { | ||
sum := 0 | ||
for _, endpoints := range p.panicEvents { | ||
sum += endpoints.Len() | ||
} | ||
return sum | ||
} | ||
|
||
type panicEventsForUserAgent struct { | ||
panicEvents map[string]panicEventsForEndpoint | ||
} | ||
|
||
func NewPanicEventsForUserAgent() panicEventsForUserAgent { | ||
return panicEventsForUserAgent{ | ||
panicEvents: make(map[string]panicEventsForEndpoint), | ||
} | ||
} | ||
|
||
func (p panicEventsForUserAgent) Insert(useragent string, endpoint string, pe panicEvent) { | ||
events, ok := p.panicEvents[useragent] | ||
if !ok { | ||
events = NewPanicEventsForEndpoint() | ||
} | ||
events.Insert(endpoint, pe) | ||
p.panicEvents[useragent] = events | ||
} | ||
|
||
type apiserverPaniced struct { | ||
lock sync.Mutex | ||
panicEventsPerUserAgent panicEventsForUserAgent | ||
} | ||
|
||
func CheckForApiserverPaniced() *apiserverPaniced { | ||
return &apiserverPaniced{ | ||
panicEventsPerUserAgent: NewPanicEventsForUserAgent(), | ||
} | ||
} | ||
|
||
func (s *apiserverPaniced) HandleAuditLogEvent(auditEvent *auditv1.Event, beginning, end *metav1.MicroTime) { | ||
if beginning != nil && auditEvent.RequestReceivedTimestamp.Before(beginning) || end != nil && end.Before(&auditEvent.RequestReceivedTimestamp) { | ||
return | ||
} | ||
|
||
if auditEvent.ResponseStatus == nil { | ||
return | ||
} | ||
if auditEvent.ResponseStatus.Code != 500 { | ||
return | ||
} | ||
|
||
s.lock.Lock() | ||
defer s.lock.Unlock() | ||
|
||
pe := panicEvent{ | ||
auditID: auditEvent.AuditID, | ||
timestamp: auditEvent.RequestReceivedTimestamp.Time, | ||
} | ||
s.panicEventsPerUserAgent.Insert(auditEvent.UserAgent, auditEvent.RequestURI, pe) | ||
} |
5 changes: 3 additions & 2 deletions
5
pkg/monitortests/kubeapiserver/auditloganalyzer/handle_excessive_applies.go
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,6 +31,7 @@ type auditLogAnalyzer struct { | |
invalidRequestsChecker *invalidRequests | ||
requestsDuringShutdownChecker *lateRequestTracking | ||
violationChecker *auditViolations | ||
apiserverPanicChecker *apiserverPaniced | ||
|
||
countsForInstall *CountsForRun | ||
} | ||
|
@@ -42,6 +43,7 @@ func NewAuditLogAnalyzer() monitortestframework.MonitorTest { | |
invalidRequestsChecker: CheckForInvalidMutations(), | ||
requestsDuringShutdownChecker: CheckForRequestsDuringShutdown(), | ||
violationChecker: CheckForViolations(), | ||
apiserverPanicChecker: CheckForApiserverPaniced(), | ||
} | ||
} | ||
|
||
|
@@ -85,6 +87,7 @@ func (w *auditLogAnalyzer) CollectData(ctx context.Context, storageDir string, b | |
w.invalidRequestsChecker, | ||
w.requestsDuringShutdownChecker, | ||
w.violationChecker, | ||
w.apiserverPanicChecker, | ||
} | ||
if w.requestCountTracking != nil { | ||
auditLogHandlers = append(auditLogHandlers, w.requestCountTracking) | ||
|
@@ -367,6 +370,37 @@ func (w *auditLogAnalyzer) EvaluateTestsFromConstructedIntervals(ctx context.Con | |
|
||
ret = append(ret, w.violationChecker.CreateJunits()...) | ||
|
||
for userAgent, userAgentPanics := range w.apiserverPanicChecker.panicEventsPerUserAgent.panicEvents { | ||
testName := fmt.Sprintf("user %s must not produce too many apiserver handler panics", userAgent) | ||
|
||
failures := []string{} | ||
if userAgentPanics.Len() > 5 { | ||
failures := append(failures, userAgentPanics.String()) | ||
ret = append(ret, | ||
&junitapi.JUnitTestCase{ | ||
Name: testName, | ||
FailureOutput: &junitapi.FailureOutput{ | ||
Message: strings.Join(failures, "\n"), | ||
Output: "details in audit log", | ||
}, | ||
}, | ||
) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add a success here so that we flake instead of fail to start. We can still search for flakes. |
||
// Add success again to make it a flake | ||
ret = append(ret, | ||
&junitapi.JUnitTestCase{ | ||
Name: testName, | ||
}, | ||
) | ||
} else { | ||
ret = append(ret, | ||
&junitapi.JUnitTestCase{ | ||
Name: testName, | ||
}, | ||
) | ||
} | ||
|
||
} | ||
|
||
return ret, nil | ||
} | ||
|
||
|
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is generating random test names, infinitely increasing the tests in sippy even as a presubmit. 150 tests so far. We'll clean those up but this needs to be a megatest, can't do a test per uncontrolled user inputs.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ex name | user ip-10-0-21-189/ovnkube@038aea1608e0 (linux/amd64) kubernetes/v0.31.1 must not produce too many apiserver handler panics
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Right, this may not have an authenticated user.
Should we use "too many apiserver handler panics produced" as testname and move useragent to the error message?
Perhaps, instead we could add username if available and use "unknown" otherwise? The usernames may be infinite too though
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It does seem dangerous even if just authed users, presumably tests could be making some with random names? Typically we would write a whitelist for known users you're particularly interested in broken into their own tests, plus a fallback catch all single test, or just the fallback single test.