feat(executors): add workflow-specific metrics to otel

AndreyMarkinPPC · AndreyMarkinPPC · commit 762d1958cf2e · 2026-05-13T14:22:12.000+04:00
* track workflow execution count and duration
* track execution count and duration of each workflow step
* export garf build info as a metric
* add simulation support to the server endpoint
diff --git a/libs/executors/garf/executors/entrypoints/server.py b/libs/executors/garf/executors/entrypoints/server.py
@@ -28,6 +28,7 @@
 import typer
 import uvicorn
 import yaml
+from garf.executors import telemetry
 from garf.executors.entrypoints import tasks, utils
 from garf.executors.entrypoints.tracer import (
   initialize_logger,
@@ -158,6 +159,7 @@ def execute_workflow(
   config_path: str | pathlib.Path = None,
   selected_aliases: Optional[list[str]] = None,
   skipped_aliases: Optional[list[str]] = None,
+  simulate: bool = False,
 ) -> list[str]:
   """Runs garf workflow till completion."""
   try:
@@ -170,6 +172,7 @@ def execute_workflow(
     execution_workflow=execution_workflow.model_dump(),
     selected_aliases=selected_aliases,
     skipped_aliases=skipped_aliases,
+    simulate=simulate,
   )
 
 
@@ -252,7 +255,7 @@ def _init_workflow(
   else:
     raise workflow.GarfWorkflowError('Neither workflow path nor file provided')
   return workflow.Workflow(
-    steps=workflow_data.get('steps'),
+    **workflow_data,
     execution_config=config_data,
   )
 
@@ -266,6 +269,14 @@ def main(
     int, typer.Option('--port', '-p', help='Port to start the server')
   ] = 8000,
 ):
+  telemetry.executor_info.set(
+    1,
+    {
+      'version_executors': garf.executors.__version__,
+      'version_core': garf.core.__version__,
+      'version_io': garf.io.__version__,
+    },
+  )
   uvicorn.run(app, host=host, port=port, log_config=None)
 
 
diff --git a/libs/executors/garf/executors/entrypoints/tasks.py b/libs/executors/garf/executors/entrypoints/tasks.py
@@ -129,11 +129,16 @@ def execute_workflow(
   execution_workflow: workflow.Workflow,
   selected_aliases: list[str],
   skipped_aliases: list[str],
+  simulate: bool,
 ):
   """Executes a batch of queries."""
   return workflow_runner.WorkflowRunner(
     execution_workflow=execution_workflow
-  ).run(selected_aliases=selected_aliases, skipped_aliases=skipped_aliases)
+  ).run(
+    selected_aliases=selected_aliases,
+    skipped_aliases=skipped_aliases,
+    simulate=simulate,
+  )
 
 
 @app.task(pydantic=True)
diff --git a/libs/executors/garf/executors/executor.py b/libs/executors/garf/executors/executor.py
@@ -21,7 +21,12 @@
 from typing import Optional
 
 from garf.core import query_editor, report, report_fetcher
-from garf.executors import execution_context, query_processor, telemetry
+from garf.executors import (
+  exceptions,
+  execution_context,
+  query_processor,
+  telemetry,
+)
 from garf.executors.telemetry import tracer
 from garf.io.writers import abs_writer
 from opentelemetry import trace
@@ -78,7 +83,11 @@ def execute(
       context = query_processor.process_gquery(context)
     if self.preprocessors:
       _handle_processors(processors=self.preprocessors, context=context)
-    results = self._execute(query=query_text, title=title, context=context)
+    try:
+      results = self._execute(query=query_text, title=title, context=context)
+    except exceptions.GarfExecutorError as e:
+      telemetry.executor_error_counter.add(1, executor_attributes)
+      raise e
     if hasattr(self, 'fetcher'):
       fetcher_attributes = {
         'api.client.class': self.fetcher.api_client.__class__.__name__
diff --git a/libs/executors/garf/executors/telemetry.py b/libs/executors/garf/executors/telemetry.py
@@ -20,12 +20,24 @@
 )
 meter = metrics.get_meter('garf.executors')
 
+executor_info = meter.create_gauge(
+  'garf_info',
+  unit='',
+  description='Build info of garf executor',
+)
+
 executor_counter = meter.create_counter(
   'garf_execute_total',
   unit='1',
   description='Counts number of executor invocations',
 )
 
+executor_error_counter = meter.create_counter(
+  'garf_execute_errors_total',
+  unit='1',
+  description='Counts number of executor failures',
+)
+
 executor_histogram = meter.create_histogram(
   'garf_execute_duration_seconds',
   unit='s',
@@ -36,3 +48,39 @@
   unit='s',
   description='Measures report writes duration in seconds',
 )
+
+workflow_counter = meter.create_counter(
+  'garf_workflow_run_total',
+  unit='1',
+  description='Counts number of workflow runs',
+)
+
+workflow_histogram = meter.create_histogram(
+  'garf_workflow_run_duration_seconds',
+  unit='s',
+  description='Measures workflow run duration in seconds',
+)
+
+workflow_error_counter = meter.create_counter(
+  'garf_workflow_run_errors_total',
+  unit='1',
+  description='Counts number of workflow failures',
+)
+
+workflow_step_counter = meter.create_counter(
+  'garf_workflow_step_run_total',
+  unit='1',
+  description='Counts number of runs of workflow step',
+)
+
+workflow_step_histogram = meter.create_histogram(
+  'garf_workflow_step_run_duration_seconds',
+  unit='s',
+  description='Measures run duration in seconds of workflow step',
+)
+
+workflow_step_error_counter = meter.create_counter(
+  'garf_workflow_step_run_errors_total',
+  unit='1',
+  description='Counts number of workflow step failures',
+)
diff --git a/libs/executors/garf/executors/workflows/workflow_runner.py b/libs/executors/garf/executors/workflows/workflow_runner.py
@@ -17,12 +17,14 @@
 
 import logging
 import pathlib
+import time
 from typing import Final
 
 import yaml
-from garf.executors import exceptions, setup
+from garf.executors import exceptions, setup, telemetry
 from garf.executors.telemetry import tracer
 from garf.executors.workflows import workflow
+from opentelemetry import trace
 
 logger = logging.getLogger(__name__)
 
@@ -65,6 +67,7 @@ def from_file(
       execution_workflow=execution_workflow, wf_parent=workflow_file.parent
     )
 
+  @tracer.start_as_current_span('workflow.run')
   def run(
     self,
     enable_cache: bool = False,
@@ -73,13 +76,23 @@ def run(
     skipped_aliases: list[str] | None = None,
     simulate: bool = False,
   ) -> list[str]:
+    span = trace.get_current_span()
+    start_time = time.perf_counter()
+    workflow_attributes = {}
+    if name := self.workflow.name:
+      workflow_attributes.update({'workflow.name': name})
+    if version := self.workflow.metadata.version:
+      workflow_attributes.update({'workflow.version': version})
+    if workflow_attributes:
+      span.set_attributes(workflow_attributes)
     self.workflow.compile()
     skipped_aliases = skipped_aliases or []
     selected_aliases = selected_aliases or []
     execution_results = []
     logger.info('Starting Garf Workflow...')
     for i, step in enumerate(self.workflow.steps, 1):
       step_name = f'{i}-{step.fetcher}'
+
       if step.alias:
         step_name = f'{step_name}-{step.alias}'
       if step.alias in skipped_aliases:
@@ -98,6 +111,10 @@ def run(
           step.alias,
         )
         continue
+      workflow_step_attributes = {
+        **workflow_attributes,
+        'workflow.step.name': step_name,
+      }
       with tracer.start_as_current_span(step_name):
         logger.info(
           'Running step %d, fetcher: %s, alias: %s', i, step.fetcher, step.alias
@@ -123,13 +140,27 @@ def run(
               batch[q.title] = q.text
           else:
             batch[query.title] = query.text
-        query_executor.execute_batch(
-          batch,
-          step.context,
-          step.parallel_threshold or self.parallel_threshold,
-        )
-        execution_results.append(step_name)
+        try:
+          step_start_time = time.perf_counter()
+          query_executor.execute_batch(
+            batch,
+            step.context,
+            step.parallel_threshold or self.parallel_threshold,
+          )
+          execution_results.append(step_name)
+          telemetry.workflow_step_counter.add(1, workflow_step_attributes)
+          step_duration = time.perf_counter() - step_start_time
+          telemetry.workflow_step_histogram.record(
+            step_duration, workflow_step_attributes
+          )
+        except exceptions.GarfExecutorError as e:
+          telemetry.workflow_step_error_counter.add(1, workflow_step_attributes)
+          telemetry.workflow_error_counter.add(1, workflow_attributes)
+          raise e
     logger.info('Garf Workflow completed.')
+    telemetry.workflow_counter.add(1, workflow_attributes)
+    duration = time.perf_counter() - start_time
+    telemetry.workflow_histogram.record(duration, workflow_attributes)
     return execution_results
 
   def compile(self, path: str | pathlib.Path) -> str: