Skip to content

Commit 2073870

Browse files
authored
[Monitoring] Testcase upload metrics for the triage lifecycle (#4364)
### Motivation Chrome security shepherds manually upload testcases through appengine, triggering analyze task and, in case of a legitimate crash, the followup progression tasks: * Minimize * Analyze * Impact * Regression * Cleanup cronjob, when updating a bug to inform the user that all above stages were finished This PR adds instrumentation to track the time elapsed between the user upload, and the completion of the above events. ### Attention points * TestcaseUploadMetadata.timestamp was being mutated on the preprocess stage for analyze task. This mutation was removed, so that this entity can be the source of truth for when a testcase was in fact uploaded by the user. * The job name could be retrieved from the JOB_NAME env var within the uworker, however this does not work for the cleanup use case. For this reason, the job name is fetched from datastore instead. * The ```query_testcase_upload_metadata``` method was moved from analyze_task.py to a helpers file, so it could be reused across tasks and on the cleanup cronjob ### Testing strategy Every task mentioned was executed locally, with a valid uploaded testcase. The codepath for the metric emission was hit and produced the desired output, both for the tasks and the cronjob. Part of #4271
1 parent 9ee9387 commit 2073870

File tree

7 files changed

+125
-17
lines changed

7 files changed

+125
-17
lines changed

src/clusterfuzz/_internal/bot/tasks/impact_task.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from clusterfuzz._internal.build_management import build_manager
1919
from clusterfuzz._internal.build_management import revisions
2020
from clusterfuzz._internal.chrome import build_info
21+
from clusterfuzz._internal.common import testcase_utils
2122
from clusterfuzz._internal.datastore import data_handler
2223
from clusterfuzz._internal.datastore import data_types
2324
from clusterfuzz._internal.metrics import logs
@@ -326,4 +327,7 @@ def execute_task(testcase_id, job_type):
326327
impacts = get_impacts_from_url(testcase.regression, testcase.job_type)
327328
testcase = data_handler.get_testcase_by_id(testcase_id)
328329
set_testcase_with_impacts(testcase, impacts)
330+
testcase_utils.emit_testcase_triage_duration_metric(
331+
testcase_id,
332+
testcase_utils.TESTCASE_TRIAGE_DURATION_IMPACT_COMPLETED_STEP)
329333
data_handler.update_testcase_comment(testcase, data_types.TaskState.FINISHED)

src/clusterfuzz/_internal/bot/tasks/utasks/analyze_task.py

Lines changed: 19 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
# limitations under the License.
1414
"""Analyze task for handling user uploads."""
1515

16-
import datetime
1716
import json
1817
from typing import Dict
1918
from typing import Optional
@@ -28,6 +27,7 @@
2827
from clusterfuzz._internal.bot.tasks.utasks import uworker_io
2928
from clusterfuzz._internal.build_management import build_manager
3029
from clusterfuzz._internal.build_management import revisions
30+
from clusterfuzz._internal.common import testcase_utils
3131
from clusterfuzz._internal.crash_analysis import crash_analyzer
3232
from clusterfuzz._internal.crash_analysis import severity_analyzer
3333
from clusterfuzz._internal.datastore import data_handler
@@ -119,7 +119,7 @@ def handle_analyze_no_revision_index(output):
119119

120120
def handle_analyze_close_invalid_uploaded(output):
121121
testcase = data_handler.get_testcase_by_id(output.uworker_input.testcase_id)
122-
testcase_upload_metadata = query_testcase_upload_metadata(
122+
testcase_upload_metadata = testcase_utils.get_testcase_upload_metadata(
123123
output.uworker_input.testcase_id)
124124
data_handler.close_invalid_uploaded_testcase(
125125
testcase, testcase_upload_metadata, 'Irrelevant')
@@ -259,7 +259,7 @@ def handle_noncrash(output):
259259
tasks.add_task('analyze', output.uworker_input.testcase_id,
260260
output.uworker_input.job_type)
261261
return
262-
testcase_upload_metadata = query_testcase_upload_metadata(
262+
testcase_upload_metadata = testcase_utils.get_testcase_upload_metadata(
263263
output.uworker_input.testcase_id)
264264
data_handler.mark_invalid_uploaded_testcase(
265265
testcase, testcase_upload_metadata, 'Unreproducible')
@@ -299,17 +299,24 @@ def utask_preprocess(testcase_id, job_type, uworker_env):
299299
testcase = data_handler.get_testcase_by_id(testcase_id)
300300
data_handler.update_testcase_comment(testcase, data_types.TaskState.STARTED)
301301

302-
testcase_upload_metadata = query_testcase_upload_metadata(testcase_id)
302+
testcase_upload_metadata = testcase_utils.get_testcase_upload_metadata(
303+
testcase_id)
303304
if not testcase_upload_metadata:
304305
logs.error('Testcase %s has no associated upload metadata.' % testcase_id)
305306
testcase.key.delete()
306307
return None
307308

308309
# Store the bot name and timestamp in upload metadata.
309310
testcase_upload_metadata.bot_name = environment.get_value('BOT_NAME')
310-
testcase_upload_metadata.timestamp = datetime.datetime.utcnow()
311311
testcase_upload_metadata.put()
312312

313+
# Emmits a TESTCASE_TRIAGE_DURATION metric, in order to track the time
314+
# elapsed between testcase upload and pulling the task from the queue.
315+
316+
testcase_utils.emit_testcase_triage_duration_metric(
317+
int(testcase_id),
318+
testcase_utils.TESTCASE_TRIAGE_DURATION_ANALYZE_LAUNCHED_STEP)
319+
313320
initialize_testcase_for_main(testcase, job_type)
314321

315322
setup_input = setup.preprocess_setup_testcase(testcase, uworker_env)
@@ -481,7 +488,7 @@ def handle_build_setup_error(output):
481488
output.uworker_input.job_type,
482489
wait_time=testcase_fail_wait)
483490
return
484-
testcase_upload_metadata = query_testcase_upload_metadata(
491+
testcase_upload_metadata = testcase_utils.get_testcase_upload_metadata(
485492
output.uworker_input.testcase_id)
486493
data_handler.mark_invalid_uploaded_testcase(
487494
testcase, testcase_upload_metadata, 'Build setup failed')
@@ -552,13 +559,16 @@ def _update_testcase(output):
552559
def utask_postprocess(output):
553560
"""Trusted: Cleans up after a uworker execute_task, writing anything needed to
554561
the db."""
562+
testcase = data_handler.get_testcase_by_id(output.uworker_input.testcase_id)
563+
testcase_upload_metadata = testcase_utils.get_testcase_upload_metadata(
564+
output.uworker_input.testcase_id)
565+
testcase_utils.emit_testcase_triage_duration_metric(
566+
int(output.uworker_input.testcase_id),
567+
testcase_utils.TESTCASE_TRIAGE_DURATION_ANALYZE_COMPLETED_STEP)
555568
_update_testcase(output)
556569
if output.error_type != uworker_msg_pb2.ErrorType.NO_ERROR: # pylint: disable=no-member
557570
_ERROR_HANDLER.handle(output)
558571
return
559-
testcase = data_handler.get_testcase_by_id(output.uworker_input.testcase_id)
560-
testcase_upload_metadata = query_testcase_upload_metadata(
561-
output.uworker_input.testcase_id)
562572

563573
log_message = (f'Testcase crashed in {output.test_timeout} seconds '
564574
f'(r{testcase.crash_revision})')
@@ -612,9 +622,3 @@ def utask_postprocess(output):
612622
# 5. Get second stacktrace from another job in case of
613623
# one-time crashes (stack).
614624
task_creation.create_tasks(testcase)
615-
616-
617-
def query_testcase_upload_metadata(
618-
testcase_id: str) -> Optional[data_types.TestcaseUploadMetadata]:
619-
return data_types.TestcaseUploadMetadata.query(
620-
data_types.TestcaseUploadMetadata.testcase_id == int(testcase_id)).get()

src/clusterfuzz/_internal/bot/tasks/utasks/minimize_task.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
from clusterfuzz._internal.bot.tokenizer.grammars.JavaScriptLexer import \
4545
JavaScriptLexer
4646
from clusterfuzz._internal.build_management import build_manager
47+
from clusterfuzz._internal.common import testcase_utils
4748
from clusterfuzz._internal.crash_analysis import severity_analyzer
4849
from clusterfuzz._internal.crash_analysis.crash_comparer import CrashComparer
4950
from clusterfuzz._internal.crash_analysis.crash_result import CrashResult
@@ -834,6 +835,9 @@ def finalize_testcase(testcase_id, last_crash_result_dict, flaky_stack=False):
834835

835836
def utask_postprocess(output):
836837
"""Postprocess in a trusted bot."""
838+
testcase_utils.emit_testcase_triage_duration_metric(
839+
int(output.uworker_input.testcase_id),
840+
testcase_utils.TESTCASE_TRIAGE_DURATION_MINIMIZE_COMPLETED_STEP)
837841
update_testcase(output)
838842
_cleanup_unused_blobs_from_storage(output)
839843
if output.error_type != uworker_msg_pb2.ErrorType.NO_ERROR: # pylint: disable=no-member

src/clusterfuzz/_internal/bot/tasks/utasks/regression_task.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
from clusterfuzz._internal.bot.tasks.utasks import uworker_io
2929
from clusterfuzz._internal.build_management import build_manager
3030
from clusterfuzz._internal.build_management import revisions
31+
from clusterfuzz._internal.common import testcase_utils
3132
from clusterfuzz._internal.datastore import data_handler
3233
from clusterfuzz._internal.datastore import data_types
3334
from clusterfuzz._internal.google_cloud_utils import big_query
@@ -583,12 +584,16 @@ def utask_postprocess(output: uworker_msg_pb2.Output) -> None: # pylint: disabl
583584
584585
Runs on a trusted worker.
585586
"""
587+
testcase_id = output.uworker_input.testcase_id
588+
testcase_utils.emit_testcase_triage_duration_metric(
589+
int(testcase_id),
590+
testcase_utils.TESTCASE_TRIAGE_DURATION_REGRESSION_COMPLETED_STEP)
591+
586592
if output.HasField('regression_task_output'):
587593
task_output = output.regression_task_output
588594
_update_build_metadata(output.uworker_input.job_type,
589595
task_output.build_data_list)
590-
_save_current_regression_range_indices(task_output,
591-
output.uworker_input.testcase_id)
596+
_save_current_regression_range_indices(task_output, testcase_id)
592597
if task_output.is_testcase_reproducible:
593598
# Clear metadata from previous runs had it been marked as potentially
594599
# flaky.
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
# Copyright 2024 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
"""Holds helpers for reuse across different tasks."""
15+
16+
import datetime
17+
import os
18+
from typing import Optional
19+
20+
from clusterfuzz._internal.datastore import data_handler
21+
from clusterfuzz._internal.datastore import data_types
22+
from clusterfuzz._internal.metrics import logs
23+
from clusterfuzz._internal.metrics import monitoring_metrics
24+
25+
TESTCASE_TRIAGE_DURATION_ANALYZE_LAUNCHED_STEP = 'analyze_launched'
26+
TESTCASE_TRIAGE_DURATION_IMPACT_COMPLETED_STEP = 'impact_completed'
27+
TESTCASE_TRIAGE_DURATION_ANALYZE_COMPLETED_STEP = 'analyze_completed'
28+
TESTCASE_TRIAGE_DURATION_MINIMIZE_COMPLETED_STEP = 'minimize_completed'
29+
TESTCASE_TRIAGE_DURATION_REGRESSION_COMPLETED_STEP = 'regression_completed'
30+
TESTCASE_TRIAGE_DURATION_ISSUE_UPDATED_STEP = 'issue_updated'
31+
32+
33+
def emit_testcase_triage_duration_metric(testcase_id: int, step: str):
34+
testcase_upload_metadata = get_testcase_upload_metadata(testcase_id)
35+
if not testcase_upload_metadata:
36+
logs.warning(f'No upload metadata found for testcase {testcase_id},'
37+
' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
38+
return
39+
if not testcase_upload_metadata.timestamp:
40+
logs.warning(
41+
f'No timestamp for testcase {testcase_upload_metadata.testcase_id},'
42+
' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
43+
return
44+
assert step in [
45+
'analyze_launched', 'analyze_completed', 'minimize_completed',
46+
'regression_completed', 'impact_completed', 'issue_updated'
47+
]
48+
elapsed_time_since_upload = datetime.datetime.utcnow()
49+
elapsed_time_since_upload -= testcase_upload_metadata.timestamp
50+
elapsed_time_since_upload = elapsed_time_since_upload.total_seconds()
51+
52+
testcase = data_handler.get_testcase_by_id(testcase_id)
53+
54+
if not testcase:
55+
logs.warning(f'No testcase found with id {testcase_id},'
56+
' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
57+
return
58+
59+
if not testcase.job_type:
60+
logs.warning(f'No job_type associated to testcase {testcase_id},'
61+
' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
62+
return
63+
64+
monitoring_metrics.TESTCASE_UPLOAD_TRIAGE_DURATION.add(
65+
elapsed_time_since_upload,
66+
labels={
67+
'job': testcase.job_type,
68+
'step': step,
69+
})
70+
71+
72+
def get_testcase_upload_metadata(
73+
testcase_id) -> Optional[data_types.TestcaseUploadMetadata]:
74+
return data_types.TestcaseUploadMetadata.query(
75+
data_types.TestcaseUploadMetadata.testcase_id == int(testcase_id)).get()

src/clusterfuzz/_internal/cron/cleanup.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
from clusterfuzz._internal.base import memoize
2525
from clusterfuzz._internal.base import utils
2626
from clusterfuzz._internal.chrome import build_info
27+
from clusterfuzz._internal.common import testcase_utils
2728
from clusterfuzz._internal.crash_analysis import crash_comparer
2829
from clusterfuzz._internal.crash_analysis import severity_analyzer
2930
from clusterfuzz._internal.cron.libs import mail
@@ -911,6 +912,11 @@ def _update_issue_when_uploaded_testcase_is_processed(
911912
policy, testcase, issue)
912913
issue.save(new_comment=comment, notify=notify)
913914

915+
# Testcase is a data_types.Testcase
916+
testcase_id = testcase.key.id()
917+
testcase_utils.emit_testcase_triage_duration_metric(
918+
testcase_id, testcase_utils.TESTCASE_TRIAGE_DURATION_ISSUE_UPDATED_STEP)
919+
914920

915921
def notify_uploader_when_testcase_is_processed(policy, testcase, issue):
916922
"""Notify uploader by email when all the testcase tasks are finished."""

src/clusterfuzz/_internal/metrics/monitoring_metrics.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,16 @@
207207
],
208208
)
209209

210+
TESTCASE_UPLOAD_TRIAGE_DURATION = monitor.CumulativeDistributionMetric(
211+
'uploaded_testcase_analysis/triage_duration_secs',
212+
description=('Time elapsed between testcase upload and completion'
213+
' of relevant tasks in the testcase upload lifecycle.'),
214+
bucketer=monitor.GeometricBucketer(),
215+
field_spec=[
216+
monitor.StringField('step'),
217+
monitor.StringField('job'),
218+
],
219+
)
210220
TASK_RATE_LIMIT_COUNT = monitor.CounterMetric(
211221
'task/rate_limit',
212222
description=('Counter for rate limit events.'),

0 commit comments

Comments
 (0)