Skip to content

Commit 6535b95

Browse files
committed
target: track explicit test cache counts
A user reported that the test grid could show cached tests as non-cached. That happened because test target history inferred cache state from timestamps and flattened multiple test attempts down to a lossy bool. This change keeps target history test-only, adds explicit local, remote, and total cached attempt counts to ClickHouse-backed test history, and updates the TAP grid to derive full versus partial cache state from those fields. The primary DB path stays on the legacy bool, and cached remains in TargetStatus as a fallback for older ClickHouse rows that do not have the new count columns yet. This requires an additive ClickHouse migration for TestTargetStatuses. Older rows still render via the legacy bool and timestamp heuristic until they are rewritten, while new rows carry the full cache-attempt semantics.
1 parent 9ae3bd7 commit 6535b95

8 files changed

Lines changed: 495 additions & 101 deletions

File tree

enterprise/app/tap/grid.tsx

Lines changed: 78 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,15 @@ interface Stat {
7979
cached: number;
8080
}
8181

82+
interface CacheInfo {
83+
fullyCached: boolean;
84+
partiallyCached: boolean;
85+
cachedCount: number;
86+
cachedLocallyCount: number;
87+
cachedRemotelyCount: number;
88+
totalRunCount: number;
89+
}
90+
8291
const Status = api.v1.Status;
8392

8493
const MIN_OPACITY = 0.1;
@@ -174,6 +183,7 @@ export default class TestGridComponent extends React.Component<Props, State> {
174183
for (let targetHistory of histories) {
175184
let stats: Stat = { count: 0, pass: 0, totalDuration: 0, maxDuration: 0, avgDuration: 0, flake: 0, cached: 0 };
176185
for (let status of targetHistory.targetStatus) {
186+
const cacheInfo = getCacheInfo(status);
177187
stats.count += 1;
178188
let duration = this.durationToNum(status.timing?.duration || undefined);
179189
stats.totalDuration += duration;
@@ -183,7 +193,7 @@ export default class TestGridComponent extends React.Component<Props, State> {
183193
} else if (status.status == Status.FLAKY) {
184194
stats.flake += 1;
185195
}
186-
if (isCached(status)) {
196+
if (cacheInfo.fullyCached) {
187197
stats.cached += 1;
188198
}
189199
}
@@ -525,6 +535,7 @@ export default class TestGridComponent extends React.Component<Props, State> {
525535
return (
526536
<div className="tap-commit-container">
527537
{statuses.map((status) => {
538+
const cacheInfo = getCacheInfo(status);
528539
let destinationUrl = `/invocation/${status.invocationId}?${new URLSearchParams({
529540
target: targetHistory.target?.label || "",
530541
targetStatus: String(status),
@@ -536,9 +547,9 @@ export default class TestGridComponent extends React.Component<Props, State> {
536547
title += ` at commit ${commitStatus.commitSha}`;
537548
}
538549

539-
let cached = isCached(status);
540-
if (cached) {
541-
title += ` (cached)`;
550+
const cacheLabel = describeCacheStatus(cacheInfo);
551+
if (cacheLabel) {
552+
title += ` (${cacheLabel})`;
542553
}
543554

544555
return (
@@ -560,7 +571,7 @@ export default class TestGridComponent extends React.Component<Props, State> {
560571
className={`tap-block ${
561572
this.getColorMode() == "status" ? `status-${status.status}` : "timing"
562573
}
563-
${cached ? `cached` : ""}
574+
${cacheInfo.fullyCached ? `cached` : ""}
564575
clickable`}>
565576
{this.statusToIcon(status.status || Status.STATUS_UNSPECIFIED)}
566577
</a>
@@ -584,7 +595,68 @@ export default class TestGridComponent extends React.Component<Props, State> {
584595
}
585596
}
586597

587-
function isCached(status: target.ITargetStatus) {
598+
function getCacheInfo(status: target.ITargetStatus): CacheInfo {
599+
const cachedLocallyCount = Number(status.cachedLocallyCount || 0);
600+
const cachedRemotelyCount = Number(status.cachedRemotelyCount || 0);
601+
const cachedCount = Math.max(Number(status.cachedCount || 0), cachedLocallyCount + cachedRemotelyCount);
602+
const explicitTotalRunCount = Number(status.totalRunCount || 0);
603+
604+
if (explicitTotalRunCount > 0 || cachedCount > 0 || cachedLocallyCount > 0 || cachedRemotelyCount > 0) {
605+
const totalRunCount = Math.max(explicitTotalRunCount, cachedCount);
606+
// When explicit counts are available, use them as the source of truth. The
607+
// legacy cached bool is only a fallback for older rows without counts.
608+
const fullyCached = explicitTotalRunCount > 0 ? cachedCount === totalRunCount : Boolean(status.cached);
609+
return {
610+
fullyCached,
611+
partiallyCached: !fullyCached && cachedCount > 0,
612+
cachedCount,
613+
cachedLocallyCount,
614+
cachedRemotelyCount,
615+
totalRunCount,
616+
};
617+
}
618+
619+
// Older ClickHouse rows only have the legacy cached bool, so preserve the
620+
// historical fallback instead of treating missing counts as uncached.
621+
const fullyCached = Boolean(status.cached) || isHeuristicallyCached(status);
622+
return {
623+
fullyCached,
624+
partiallyCached: false,
625+
cachedCount: fullyCached ? 1 : 0,
626+
cachedLocallyCount: 0,
627+
cachedRemotelyCount: 0,
628+
totalRunCount: fullyCached ? 1 : 0,
629+
};
630+
}
631+
632+
function describeCacheStatus(cacheInfo: CacheInfo): string {
633+
if (!cacheInfo.fullyCached && !cacheInfo.partiallyCached) {
634+
return "";
635+
}
636+
637+
const originParts = [];
638+
if (cacheInfo.cachedLocallyCount > 0) {
639+
originParts.push(`${cacheInfo.cachedLocallyCount} local`);
640+
}
641+
if (cacheInfo.cachedRemotelyCount > 0) {
642+
originParts.push(`${cacheInfo.cachedRemotelyCount} remote`);
643+
}
644+
const originSuffix = originParts.length ? ` (${originParts.join(", ")})` : "";
645+
646+
if (cacheInfo.fullyCached) {
647+
if (cacheInfo.cachedLocallyCount > 0 && cacheInfo.cachedLocallyCount === cacheInfo.totalRunCount) {
648+
return "cached locally";
649+
}
650+
if (cacheInfo.cachedRemotelyCount > 0 && cacheInfo.cachedRemotelyCount === cacheInfo.totalRunCount) {
651+
return "cached remotely";
652+
}
653+
return `cached${originSuffix}`;
654+
}
655+
656+
return `${cacheInfo.cachedCount}/${cacheInfo.totalRunCount} cached${originSuffix}`;
657+
}
658+
659+
function isHeuristicallyCached(status: target.ITargetStatus) {
588660
return (
589661
+(status.timing?.startTime?.seconds || 0) > 0 &&
590662
Math.floor(+(status.invocationCreatedAtUsec || 0) / 1000000) > +(status.timing?.startTime?.seconds || 0)

proto/target.proto

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,9 @@ message TargetMetadata {
3131
api.v1.TestSize test_size = 5;
3232
}
3333

34+
// Status for a target within target history. Today GetTargetHistory only
35+
// stores test targets, so the cache fields below describe test attempts rather
36+
// than generic build actions.
3437
message TargetStatus {
3538
// The invocation identifier itself.
3639
string invocation_id = 1;
@@ -51,6 +54,30 @@ message TargetStatus {
5154

5255
// When the invocation was created.
5356
int64 invocation_created_at_usec = 5;
57+
58+
// Whether all recorded test attempts for this target invocation were served
59+
// from cache. For new ClickHouse rows, this legacy field is derived from
60+
// total_run_count > 0 && cached_count == total_run_count. Older rows that
61+
// predate the explicit cache counts have total_run_count == 0, so clients can
62+
// fall back to this bool.
63+
bool cached = 6;
64+
65+
// The number of cached test attempts recorded for this target invocation,
66+
// including both local and remote cache hits. A non-zero cached_count does
67+
// not imply that all attempts were cached; compare against total_run_count.
68+
int32 cached_count = 7;
69+
70+
// The number of cached test attempts served from the local cache.
71+
int32 cached_locally_count = 8;
72+
73+
// The number of cached test attempts served from the remote cache.
74+
int32 cached_remotely_count = 9;
75+
76+
// The total number of cached and uncached test shard attempts recorded for
77+
// this target invocation. This comes from Bazel TestSummary.total_run_count;
78+
// compare cached_count against it to determine whether all attempts were
79+
// cached.
80+
int32 total_run_count = 10;
5481
}
5582

5683
message TargetHistory {

server/build_event_protocol/target_tracker/target_tracker.go

Lines changed: 65 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -49,18 +49,23 @@ const (
4949
)
5050

5151
type target struct {
52-
label string
53-
aspect string
54-
ruleType string
55-
firstStartTime time.Time
56-
totalDuration time.Duration
57-
state targetState
58-
id int64
59-
overallStatus build_event_stream.TestStatus
60-
cached bool
61-
targetType cmpb.TargetType
62-
testSize build_event_stream.TestSize
63-
buildSuccess bool
52+
label string
53+
aspect string
54+
ruleType string
55+
firstStartTime time.Time
56+
totalDuration time.Duration
57+
state targetState
58+
id int64
59+
overallStatus build_event_stream.TestStatus
60+
cached bool
61+
cachedCount int32
62+
cachedLocallyCount int32
63+
cachedRemotelyCount int32
64+
totalRunCount int32
65+
observedResultCount int32
66+
targetType cmpb.TargetType
67+
testSize build_event_stream.TestSize
68+
buildSuccess bool
6469
}
6570

6671
func md5Int64(text string) int64 {
@@ -76,6 +81,23 @@ func newTarget(label string, aspect string) *target {
7681
}
7782
}
7883

84+
func (t *target) effectiveCachedCount() int32 {
85+
count := t.cachedCount
86+
if observed := t.cachedLocallyCount + t.cachedRemotelyCount; observed > count {
87+
count = observed
88+
}
89+
return count
90+
}
91+
92+
func (t *target) effectiveTotalRunCount(ctx context.Context) int32 {
93+
count := t.totalRunCount
94+
if t.observedResultCount > count {
95+
log.CtxDebugf(ctx, "Using observed TestResult count for target %q because it exceeds TestSummary total_run_count (%d > %d)", t.label, t.observedResultCount, t.totalRunCount)
96+
count = t.observedResultCount
97+
}
98+
return count
99+
}
100+
79101
func getTestStatus(aborted *build_event_stream.Aborted) build_event_stream.TestStatus {
80102
switch aborted.GetReason() {
81103
case build_event_stream.Aborted_USER_INTERRUPTED:
@@ -99,11 +121,22 @@ func (t *target) updateFromEvent(event *build_event_stream.BuildEvent) {
99121
}
100122
t.state = targetStateCompleted
101123
case *build_event_stream.BuildEvent_TestResult:
102-
t.cached = p.TestResult.GetCachedLocally() || p.TestResult.GetExecutionInfo().GetCachedRemotely()
124+
cachedLocally := p.TestResult.GetCachedLocally()
125+
cachedRemotely := p.TestResult.GetExecutionInfo().GetCachedRemotely()
126+
t.cached = cachedLocally || cachedRemotely
127+
t.observedResultCount++
128+
if cachedLocally {
129+
t.cachedLocallyCount++
130+
}
131+
if cachedRemotely {
132+
t.cachedRemotelyCount++
133+
}
103134
t.state = targetStateResult
104135
case *build_event_stream.BuildEvent_TestSummary:
105136
ts := p.TestSummary
106137
t.overallStatus = ts.GetOverallStatus()
138+
t.cachedCount = ts.GetTotalNumCached()
139+
t.totalRunCount = ts.GetTotalRunCount()
107140
t.firstStartTime = timeutil.GetTimeWithFallback(ts.GetFirstStartTime(), ts.GetFirstStartTimeMillis())
108141
t.totalDuration = timeutil.GetDurationWithFallback(ts.GetTotalRunDuration(), ts.GetTotalRunDurationMillis())
109142
t.state = targetStateSummary
@@ -356,25 +389,32 @@ func (t *TargetTracker) writeTestTargetStatusesToOLAPDB(ctx context.Context, per
356389
testStartTimeUsec = target.firstStartTime.UnixMicro()
357390
}
358391

392+
cachedCount := target.effectiveCachedCount()
393+
totalRunCount := target.effectiveTotalRunCount(ctx)
394+
359395
entries = append(entries, &schema.TestTargetStatus{
360396
GroupID: permissions.GroupID,
361397
RepoURL: repoURL,
362398
CommitSHA: commitSHA,
363399
Label: target.label,
364400
InvocationStartTimeUsec: invocationStartTime.UnixMicro(),
365401

366-
RuleType: target.ruleType,
367-
UserID: permissions.UserID,
368-
InvocationUUID: invocationUUID,
369-
TargetType: int32(target.targetType),
370-
TestSize: int32(target.testSize),
371-
Status: int32(target.overallStatus),
372-
Cached: target.cached,
373-
StartTimeUsec: testStartTimeUsec,
374-
DurationUsec: target.totalDuration.Microseconds(),
375-
BranchName: t.buildEventAccumulator.Invocation().GetBranchName(),
376-
Role: t.buildEventAccumulator.Invocation().GetRole(),
377-
Command: t.buildEventAccumulator.Invocation().GetCommand(),
402+
RuleType: target.ruleType,
403+
UserID: permissions.UserID,
404+
InvocationUUID: invocationUUID,
405+
TargetType: int32(target.targetType),
406+
TestSize: int32(target.testSize),
407+
Status: int32(target.overallStatus),
408+
Cached: totalRunCount > 0 && cachedCount == totalRunCount,
409+
CachedCount: cachedCount,
410+
CachedLocallyCount: target.cachedLocallyCount,
411+
CachedRemotelyCount: target.cachedRemotelyCount,
412+
TotalRunCount: totalRunCount,
413+
StartTimeUsec: testStartTimeUsec,
414+
DurationUsec: target.totalDuration.Microseconds(),
415+
BranchName: t.buildEventAccumulator.Invocation().GetBranchName(),
416+
Role: t.buildEventAccumulator.Invocation().GetRole(),
417+
Command: t.buildEventAccumulator.Invocation().GetCommand(),
378418
})
379419
}
380420
err := t.env.GetOLAPDBHandle().FlushTestTargetStatuses(ctx, entries)

0 commit comments

Comments
 (0)