[Monitoring] Testcase upload metrics for the triage lifecycle (#4364)

vitorguidi · web-flow · commit 2073870bc84d · 2024-11-13T12:37:36.000-03:00
### Motivation

Chrome security shepherds manually upload testcases through App Engine, which triggers the analyze task and, in case of a legitimate crash, the follow-up progression tasks:

* Minimize
* Analyze
* Impact
* Regression
* Cleanup cronjob, which updates the bug to inform the user that all of the above stages have finished

This PR adds instrumentation to track the time elapsed between the user upload and the completion of each of the above events.

### Attention points

* `TestcaseUploadMetadata.timestamp` was being mutated in the preprocess stage of the analyze task. This mutation was removed, so that this entity can be the source of truth for when a testcase was in fact uploaded by the user.
* The job name could be retrieved from the `JOB_NAME` env var within the uworker; however, this does not work for the cleanup use case. For this reason, the job name is fetched from datastore instead.
* The `query_testcase_upload_metadata` method was moved from analyze_task.py to a helpers file (as `testcase_utils.get_testcase_upload_metadata`), so that it can be reused across tasks and in the cleanup cronjob.

### Testing strategy

Every task mentioned was executed locally with a valid uploaded testcase. The codepath for the metric emission was hit and produced the desired output, both for the tasks and for the cronjob.

Part of #4271
diff --git a/src/clusterfuzz/_internal/bot/tasks/impact_task.py b/src/clusterfuzz/_internal/bot/tasks/impact_task.py
@@ -18,6 +18,7 @@
 from clusterfuzz._internal.build_management import build_manager
 from clusterfuzz._internal.build_management import revisions
 from clusterfuzz._internal.chrome import build_info
+from clusterfuzz._internal.common import testcase_utils
 from clusterfuzz._internal.datastore import data_handler
 from clusterfuzz._internal.datastore import data_types
 from clusterfuzz._internal.metrics import logs
@@ -326,4 +327,7 @@ def execute_task(testcase_id, job_type):
   impacts = get_impacts_from_url(testcase.regression, testcase.job_type)
   testcase = data_handler.get_testcase_by_id(testcase_id)
   set_testcase_with_impacts(testcase, impacts)
+  testcase_utils.emit_testcase_triage_duration_metric(
+      testcase_id,
+      testcase_utils.TESTCASE_TRIAGE_DURATION_IMPACT_COMPLETED_STEP)
   data_handler.update_testcase_comment(testcase, data_types.TaskState.FINISHED)
diff --git a/src/clusterfuzz/_internal/bot/tasks/utasks/analyze_task.py b/src/clusterfuzz/_internal/bot/tasks/utasks/analyze_task.py
@@ -13,7 +13,6 @@
 # limitations under the License.
 """Analyze task for handling user uploads."""
 
-import datetime
 import json
 from typing import Dict
 from typing import Optional
@@ -28,6 +27,7 @@
 from clusterfuzz._internal.bot.tasks.utasks import uworker_io
 from clusterfuzz._internal.build_management import build_manager
 from clusterfuzz._internal.build_management import revisions
+from clusterfuzz._internal.common import testcase_utils
 from clusterfuzz._internal.crash_analysis import crash_analyzer
 from clusterfuzz._internal.crash_analysis import severity_analyzer
 from clusterfuzz._internal.datastore import data_handler
@@ -119,7 +119,7 @@ def handle_analyze_no_revision_index(output):
 
 def handle_analyze_close_invalid_uploaded(output):
   testcase = data_handler.get_testcase_by_id(output.uworker_input.testcase_id)
-  testcase_upload_metadata = query_testcase_upload_metadata(
+  testcase_upload_metadata = testcase_utils.get_testcase_upload_metadata(
       output.uworker_input.testcase_id)
   data_handler.close_invalid_uploaded_testcase(
       testcase, testcase_upload_metadata, 'Irrelevant')
@@ -259,7 +259,7 @@ def handle_noncrash(output):
     tasks.add_task('analyze', output.uworker_input.testcase_id,
                    output.uworker_input.job_type)
     return
-  testcase_upload_metadata = query_testcase_upload_metadata(
+  testcase_upload_metadata = testcase_utils.get_testcase_upload_metadata(
       output.uworker_input.testcase_id)
   data_handler.mark_invalid_uploaded_testcase(
       testcase, testcase_upload_metadata, 'Unreproducible')
@@ -299,17 +299,24 @@ def utask_preprocess(testcase_id, job_type, uworker_env):
   testcase = data_handler.get_testcase_by_id(testcase_id)
   data_handler.update_testcase_comment(testcase, data_types.TaskState.STARTED)
 
-  testcase_upload_metadata = query_testcase_upload_metadata(testcase_id)
+  testcase_upload_metadata = testcase_utils.get_testcase_upload_metadata(
+      testcase_id)
   if not testcase_upload_metadata:
     logs.error('Testcase %s has no associated upload metadata.' % testcase_id)
     testcase.key.delete()
     return None
 
-  # Store the bot name and timestamp in upload metadata.
+  # Store the bot name in upload metadata.
   testcase_upload_metadata.bot_name = environment.get_value('BOT_NAME')
-  testcase_upload_metadata.timestamp = datetime.datetime.utcnow()
   testcase_upload_metadata.put()
 
+  # Emits a TESTCASE_UPLOAD_TRIAGE_DURATION metric to track the time elapsed
+  # between testcase upload and pulling the task from the queue.
+
+  testcase_utils.emit_testcase_triage_duration_metric(
+      int(testcase_id),
+      testcase_utils.TESTCASE_TRIAGE_DURATION_ANALYZE_LAUNCHED_STEP)
+
   initialize_testcase_for_main(testcase, job_type)
 
   setup_input = setup.preprocess_setup_testcase(testcase, uworker_env)
@@ -481,7 +488,7 @@ def handle_build_setup_error(output):
         output.uworker_input.job_type,
         wait_time=testcase_fail_wait)
     return
-  testcase_upload_metadata = query_testcase_upload_metadata(
+  testcase_upload_metadata = testcase_utils.get_testcase_upload_metadata(
       output.uworker_input.testcase_id)
   data_handler.mark_invalid_uploaded_testcase(
       testcase, testcase_upload_metadata, 'Build setup failed')
@@ -552,13 +559,16 @@ def _update_testcase(output):
 def utask_postprocess(output):
   """Trusted: Cleans up after a uworker execute_task, writing anything needed to
   the db."""
+  testcase = data_handler.get_testcase_by_id(output.uworker_input.testcase_id)
+  testcase_upload_metadata = testcase_utils.get_testcase_upload_metadata(
+      output.uworker_input.testcase_id)
+  testcase_utils.emit_testcase_triage_duration_metric(
+      int(output.uworker_input.testcase_id),
+      testcase_utils.TESTCASE_TRIAGE_DURATION_ANALYZE_COMPLETED_STEP)
   _update_testcase(output)
   if output.error_type != uworker_msg_pb2.ErrorType.NO_ERROR:  # pylint: disable=no-member
     _ERROR_HANDLER.handle(output)
     return
-  testcase = data_handler.get_testcase_by_id(output.uworker_input.testcase_id)
-  testcase_upload_metadata = query_testcase_upload_metadata(
-      output.uworker_input.testcase_id)
 
   log_message = (f'Testcase crashed in {output.test_timeout} seconds '
                  f'(r{testcase.crash_revision})')
@@ -612,9 +622,3 @@ def utask_postprocess(output):
   # 5. Get second stacktrace from another job in case of
   #    one-time crashes (stack).
   task_creation.create_tasks(testcase)
-
-
-def query_testcase_upload_metadata(
-    testcase_id: str) -> Optional[data_types.TestcaseUploadMetadata]:
-  return data_types.TestcaseUploadMetadata.query(
-      data_types.TestcaseUploadMetadata.testcase_id == int(testcase_id)).get()
diff --git a/src/clusterfuzz/_internal/bot/tasks/utasks/minimize_task.py b/src/clusterfuzz/_internal/bot/tasks/utasks/minimize_task.py
@@ -44,6 +44,7 @@
 from clusterfuzz._internal.bot.tokenizer.grammars.JavaScriptLexer import \
     JavaScriptLexer
 from clusterfuzz._internal.build_management import build_manager
+from clusterfuzz._internal.common import testcase_utils
 from clusterfuzz._internal.crash_analysis import severity_analyzer
 from clusterfuzz._internal.crash_analysis.crash_comparer import CrashComparer
 from clusterfuzz._internal.crash_analysis.crash_result import CrashResult
@@ -834,6 +835,9 @@ def finalize_testcase(testcase_id, last_crash_result_dict, flaky_stack=False):
 
 def utask_postprocess(output):
   """Postprocess in a trusted bot."""
+  testcase_utils.emit_testcase_triage_duration_metric(
+      int(output.uworker_input.testcase_id),
+      testcase_utils.TESTCASE_TRIAGE_DURATION_MINIMIZE_COMPLETED_STEP)
   update_testcase(output)
   _cleanup_unused_blobs_from_storage(output)
   if output.error_type != uworker_msg_pb2.ErrorType.NO_ERROR:  # pylint: disable=no-member
diff --git a/src/clusterfuzz/_internal/bot/tasks/utasks/regression_task.py b/src/clusterfuzz/_internal/bot/tasks/utasks/regression_task.py
@@ -28,6 +28,7 @@
 from clusterfuzz._internal.bot.tasks.utasks import uworker_io
 from clusterfuzz._internal.build_management import build_manager
 from clusterfuzz._internal.build_management import revisions
+from clusterfuzz._internal.common import testcase_utils
 from clusterfuzz._internal.datastore import data_handler
 from clusterfuzz._internal.datastore import data_types
 from clusterfuzz._internal.google_cloud_utils import big_query
@@ -583,12 +584,16 @@ def utask_postprocess(output: uworker_msg_pb2.Output) -> None:  # pylint: disabl
 
   Runs on a trusted worker.
   """
+  testcase_id = output.uworker_input.testcase_id
+  testcase_utils.emit_testcase_triage_duration_metric(
+      int(testcase_id),
+      testcase_utils.TESTCASE_TRIAGE_DURATION_REGRESSION_COMPLETED_STEP)
+
   if output.HasField('regression_task_output'):
     task_output = output.regression_task_output
     _update_build_metadata(output.uworker_input.job_type,
                            task_output.build_data_list)
-    _save_current_regression_range_indices(task_output,
-                                           output.uworker_input.testcase_id)
+    _save_current_regression_range_indices(task_output, testcase_id)
     if task_output.is_testcase_reproducible:
       # Clear metadata from previous runs had it been marked as potentially
       # flaky.
diff --git a/src/clusterfuzz/_internal/common/testcase_utils.py b/src/clusterfuzz/_internal/common/testcase_utils.py
@@ -0,0 +1,105 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Holds helpers for reuse across different tasks."""
+
+import datetime
+import os
+from typing import Optional
+
+from clusterfuzz._internal.datastore import data_handler
+from clusterfuzz._internal.datastore import data_types
+from clusterfuzz._internal.metrics import logs
+from clusterfuzz._internal.metrics import monitoring_metrics
+
+TESTCASE_TRIAGE_DURATION_ANALYZE_LAUNCHED_STEP = 'analyze_launched'
+TESTCASE_TRIAGE_DURATION_IMPACT_COMPLETED_STEP = 'impact_completed'
+TESTCASE_TRIAGE_DURATION_ANALYZE_COMPLETED_STEP = 'analyze_completed'
+TESTCASE_TRIAGE_DURATION_MINIMIZE_COMPLETED_STEP = 'minimize_completed'
+TESTCASE_TRIAGE_DURATION_REGRESSION_COMPLETED_STEP = 'regression_completed'
+TESTCASE_TRIAGE_DURATION_ISSUE_UPDATED_STEP = 'issue_updated'
+
+# Every step accepted by emit_testcase_triage_duration_metric.
+_TESTCASE_TRIAGE_DURATION_STEPS = frozenset([
+    TESTCASE_TRIAGE_DURATION_ANALYZE_LAUNCHED_STEP,
+    TESTCASE_TRIAGE_DURATION_ANALYZE_COMPLETED_STEP,
+    TESTCASE_TRIAGE_DURATION_MINIMIZE_COMPLETED_STEP,
+    TESTCASE_TRIAGE_DURATION_REGRESSION_COMPLETED_STEP,
+    TESTCASE_TRIAGE_DURATION_IMPACT_COMPLETED_STEP,
+    TESTCASE_TRIAGE_DURATION_ISSUE_UPDATED_STEP,
+])
+
+
+def emit_testcase_triage_duration_metric(testcase_id: int, step: str):
+  """Emits TESTCASE_UPLOAD_TRIAGE_DURATION for a manually uploaded testcase.
+
+  The emitted value is the number of seconds between the timestamp stored in
+  the testcase's TestcaseUploadMetadata and the moment of this call, labeled
+  with the testcase's job type and |step|. If the upload metadata, its
+  timestamp, the testcase or its job type is missing, a warning is logged and
+  nothing is emitted.
+
+  Args:
+    testcase_id: Id of the testcase; converted with int() before use.
+    step: One of the TESTCASE_TRIAGE_DURATION_*_STEP constants.
+
+  Raises:
+    AssertionError: If |step| is not a known step (checked only once upload
+      metadata with a timestamp has been found).
+  """
+  testcase_id = int(testcase_id)
+  testcase_upload_metadata = get_testcase_upload_metadata(testcase_id)
+  if not testcase_upload_metadata:
+    logs.warning(f'No upload metadata found for testcase {testcase_id},'
+                 ' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
+    return
+  if not testcase_upload_metadata.timestamp:
+    logs.warning(
+        f'No timestamp for testcase {testcase_upload_metadata.testcase_id},'
+        ' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
+    return
+  if step not in _TESTCASE_TRIAGE_DURATION_STEPS:
+    # Raised explicitly because a bare `assert` is stripped under `python -O`.
+    raise AssertionError(f'Unknown testcase triage duration step: {step}')
+
+  elapsed_time_since_upload = (
+      datetime.datetime.utcnow() - testcase_upload_metadata.timestamp)
+
+  testcase = data_handler.get_testcase_by_id(testcase_id)
+  if not testcase:
+    logs.warning(f'No testcase found with id {testcase_id},'
+                 ' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
+    return
+
+  if not testcase.job_type:
+    logs.warning(f'No job_type associated to testcase {testcase_id},'
+                 ' failed to emit TESTCASE_UPLOAD_TRIAGE_DURATION metric.')
+    return
+
+  monitoring_metrics.TESTCASE_UPLOAD_TRIAGE_DURATION.add(
+      elapsed_time_since_upload.total_seconds(),
+      labels={
+          'job': testcase.job_type,
+          'step': step,
+      })
+
+
+def get_testcase_upload_metadata(
+    testcase_id) -> Optional[data_types.TestcaseUploadMetadata]:
+  """Returns the TestcaseUploadMetadata for |testcase_id|, or None.
+
+  |testcase_id| is converted with int() before querying, so any value that
+  int() accepts can be passed.
+  """
+  return data_types.TestcaseUploadMetadata.query(
+      data_types.TestcaseUploadMetadata.testcase_id == int(testcase_id)).get()
diff --git a/src/clusterfuzz/_internal/cron/cleanup.py b/src/clusterfuzz/_internal/cron/cleanup.py
@@ -24,6 +24,7 @@
 from clusterfuzz._internal.base import memoize
 from clusterfuzz._internal.base import utils
 from clusterfuzz._internal.chrome import build_info
+from clusterfuzz._internal.common import testcase_utils
 from clusterfuzz._internal.crash_analysis import crash_comparer
 from clusterfuzz._internal.crash_analysis import severity_analyzer
 from clusterfuzz._internal.cron.libs import mail
@@ -911,6 +912,11 @@ def _update_issue_when_uploaded_testcase_is_processed(
       policy, testcase, issue)
   issue.save(new_comment=comment, notify=notify)
 
+  # Testcase is a data_types.Testcase
+  testcase_id = testcase.key.id()
+  testcase_utils.emit_testcase_triage_duration_metric(
+      testcase_id, testcase_utils.TESTCASE_TRIAGE_DURATION_ISSUE_UPDATED_STEP)
+
 
 def notify_uploader_when_testcase_is_processed(policy, testcase, issue):
   """Notify uploader by email when all the testcase tasks are finished."""
diff --git a/src/clusterfuzz/_internal/metrics/monitoring_metrics.py b/src/clusterfuzz/_internal/metrics/monitoring_metrics.py
@@ -207,6 +207,17 @@
     ],
 )
 
+TESTCASE_UPLOAD_TRIAGE_DURATION = monitor.CumulativeDistributionMetric(
+    'uploaded_testcase_analysis/triage_duration_secs',
+    description=('Time elapsed between testcase upload and completion'
+                 ' of relevant tasks in the testcase upload lifecycle.'),
+    bucketer=monitor.GeometricBucketer(),
+    field_spec=[
+        monitor.StringField('step'),
+        monitor.StringField('job'),
+    ],
+)
+
 TASK_RATE_LIMIT_COUNT = monitor.CounterMetric(
     'task/rate_limit',
     description=('Counter for rate limit events.'),