canonical · javierdelapuente · Apr 29, 2025 · Apr 23, 2025 · Apr 24, 2025 · Apr 24, 2025
@@ -5,7 +5,6 @@
 
 - Add a visualization of the share of jobs started per application.
 
-
 ### 2025-04-22
 
 - Add how-to landing page.

@@ -13,10 +13,6 @@ class RunnerCreateError(RunnerError):
     """Error for runner creation failure."""
 
 
-class RunnerStartError(RunnerError):
-    """Error for runner start failure."""
-
-
 class MissingServerConfigError(RunnerError):
     """Error for unable to create runner due to missing server configurations."""
 
@@ -46,7 +42,7 @@ class TokenError(PlatformClientError):
 
 
 class JobNotFoundError(PlatformClientError):
-    """Represents an error when the job could not be found on GitHub."""
+    """Represents an error when the job could not be found on the platform."""
 
 
 class CloudError(Exception):

@@ -13,6 +13,9 @@
 from urllib.error import HTTPError
 
 import requests
+
+# HTTP404NotFoundError is not found by pylint
+from fastcore.net import HTTP404NotFoundError  # pylint: disable=no-name-in-module
 from ghapi.all import GhApi, pages
 from ghapi.page import paged
 from requests import RequestException
@@ -29,6 +32,11 @@
 
 logger = logging.getLogger(__name__)
 
+
+class GithubRunnerNotFoundError(Exception):
+    """Represents an error when the runner could not be found on GitHub."""
+
+
 # Parameters of the function decorated with retry
 ParamT = ParamSpec("ParamT")
 # Return type of the function decorated with retry
@@ -89,8 +97,38 @@ def __init__(self, token: str):
         self._client = GhApi(token=self._token)
 
     @catch_http_errors
-    def get_runner_github_info(self, path: GitHubPath, prefix: str) -> list[SelfHostedRunner]:
-        """Get runner information on GitHub under a repo or org.
+    def get_runner(self, path: GitHubPath, prefix: str, runner_id: int) -> SelfHostedRunner:
+        """Get a specific self-hosted runner information under a repo or org.
+
+        Args:
+            path: GitHub repository path in the format '<owner>/<repo>', or the GitHub organization
+                name.
+            prefix: Build the InstanceID with this prefix.
+            runner_id: Runner id to get the self hosted runner.
+
+        Raises:
+            GithubRunnerNotFoundError: If the runner is not found.
+
+        Returns:
+            The information for the requested runner.
+        """
+        try:
+            if isinstance(path, GitHubRepo):
+                raw_runner = self._client.actions.get_self_hosted_runner_for_repo(
+                    path.owner, path.repo, runner_id
+                )
+            else:
+                raw_runner = self._client.actions.get_self_hosted_runner_for_org(
+                    path.org, runner_id
+                )
+        except HTTP404NotFoundError as err:
+            raise GithubRunnerNotFoundError from err
+        instance_id = InstanceID.build_from_name(prefix, raw_runner["name"])
+        return SelfHostedRunner.build_from_github(raw_runner, instance_id)
+
+    @catch_http_errors
+    def list_runners(self, path: GitHubPath, prefix: str) -> list[SelfHostedRunner]:
+        """Get all runners information on GitHub under a repo or org.
 
         Args:
             path: GitHub repository path in the format '<owner>/<repo>', or the GitHub organization

@@ -5,6 +5,7 @@
 
 import copy
 import logging
+import time
 from dataclasses import dataclass
 from enum import Enum, auto
 from multiprocessing import Pool
@@ -23,11 +24,19 @@
 from github_runner_manager.metrics import github as github_metrics
 from github_runner_manager.metrics import runner as runner_metrics
 from github_runner_manager.metrics.runner import RunnerMetrics
-from github_runner_manager.platform.platform_provider import PlatformProvider, PlatformRunnerState
+from github_runner_manager.platform.platform_provider import (
+    PlatformProvider,
+    PlatformRunnerState,
+)
 from github_runner_manager.types_.github import SelfHostedRunner
 
 logger = logging.getLogger(__name__)
 
+# After a runner is created, there will be as many health checks as
+# elements in this variable. The elements in the tuple represent
+# the time waiting before each health check against the platform provider.
+RUNNER_CREATION_WAITING_TIMES = (60, 60, 120, 240, 480)
+
 IssuedMetricEventsStats = dict[Type[metric_events.Event], int]
 
 
@@ -474,16 +483,59 @@ def _create_runner(args: _CreateRunnerArgs) -> InstanceID:
             args.metadata.runner_id = str(github_runner.id)
 
         try:
-            args.cloud_runner_manager.create_runner(
+            cloud_instance = args.cloud_runner_manager.create_runner(
                 instance_id=instance_id,
                 metadata=args.metadata,
                 runner_context=runner_context,
             )
+
+            # This wait should be deleted to make the runner creation as
+            # quick as possible. The waiting should only be done in the
+            # reactive case, before checking that a job was taken.
+            RunnerManager.wait_for_runner_online(
+                platform_provider=args.platform_provider,
+                instance_id=cloud_instance.instance_id,
+                metadata=cloud_instance.metadata,
+            )
+
         except RunnerError:
-            # try to clean the runner in GitHub. This is necessary, as for reactive runners
-            # we do not know in the clean up if the runner is offline because if failed or
-            # because it is being created.
-            logger.warning("Deleting runner %s from platform", instance_id)
+            logger.warning("Deleting runner %s from platform after creation failed", instance_id)
             args.platform_provider.delete_runners([github_runner])
             raise
         return instance_id
+
+    @staticmethod
+    def wait_for_runner_online(
+        platform_provider: PlatformProvider,
+        instance_id: InstanceID,
+        metadata: RunnerMetadata,
+    ) -> None:
+        """Wait until the runner is online.
+
+        The constant RUNNER_CREATION_WAITING_TIMES defines the time before calling
+        the platform provider to check if the runner is online. Besides online runner,
+        deletable runner will also be equivalent to online, as no more waiting should
+        be needed.
+
+        Args:
+            platform_provider: Platform provider to use for health checks.
+            instance_id: InstanceID for the runner to wait for.
+            metadata: Metadata for the runner to wait for.
+
+        Raises:
+            RunnerError: If the runner did not come online after the specified time.
+
+        """
+        for wait_time in RUNNER_CREATION_WAITING_TIMES:
+            time.sleep(wait_time)
+            try:
+                runner_health = platform_provider.get_runner_health(
+                    metadata=metadata, instance_id=instance_id
+                )
+            except PlatformApiError as exc:
+                logger.error("Error getting the runner health: %s", exc)
+                continue
+            if runner_health.online or runner_health.deletable:
+                break
+        else:
+            raise RunnerError(f"Runner {instance_id} did not get online")
@@ -10,7 +10,6 @@
 from typing import Iterable, Sequence
 
 import fabric
-import invoke
 import jinja2
 import paramiko
 from fabric import Connection as SSHConnection
@@ -22,11 +21,9 @@
     OpenStackError,
     OpenstackHealthCheckError,
     RunnerCreateError,
-    RunnerStartError,
     SSHError,
 )
 from github_runner_manager.manager.cloud_runner_manager import (
-    CloudInitStatus,
     CloudRunnerInstance,
     CloudRunnerManager,
     CloudRunnerState,
@@ -158,11 +155,6 @@ def create_runner(
         except OpenStackError as err:
             raise RunnerCreateError(f"Failed to create {instance_id} openstack runner") from err
 
-        logger.info("Waiting for runner process to startup: %s", instance.instance_id)
-        self._wait_runner_startup(instance)
-        logger.info("Waiting for runner process to be running: %s", instance.instance_id)
-        self._wait_runner_running(instance)
-
         logger.info("Runner %s created successfully", instance.instance_id)
         return self._build_cloud_runner_instance(instance)
 
@@ -508,80 +500,6 @@ def _check_state_and_flush(self, instance: OpenstackInstance, busy: bool) -> Non
             result.stderr,
         )
 
-    @retry(tries=10, delay=60, local_logger=logger)
-    def _wait_runner_startup(self, instance: OpenstackInstance) -> None:
-        """Wait until runner is startup.
-
-        Args:
-            instance: The runner instance.
-
-        Raises:
-            RunnerStartError: The runner startup process was not found on the runner.
-        """
-        try:
-            ssh_conn = self._openstack_cloud.get_ssh_connection(instance)
-        except SSHError as err:
-            raise RunnerStartError(
-                f"Failed to SSH to {instance.instance_id} during creation possible due to setup "
-                "not completed"
-            ) from err
-
-        logger.debug("Running `cloud-init status` on instance %s.", instance.instance_id)
-        result: invoke.runners.Result = ssh_conn.run("cloud-init status", warn=True, timeout=60)
-        if not result.ok:
-            logger.warning(
-                "cloud-init status command failed on %s: %s.", instance.instance_id, result.stderr
-            )
-            raise RunnerStartError(f"Runner startup process not found on {instance.instance_id}")
-        # A short running job may have already completed and exited the runner, hence check the
-        # condition via cloud-init status check.
-        if CloudInitStatus.DONE in result.stdout:
-            return
-        logger.debug("Running `ps aux` on instance %s.", instance.instance_id)
-        result = ssh_conn.run("ps aux", warn=True, timeout=60, hide=True)
-        if not result.ok:
-            logger.warning("SSH run of `ps aux` failed on %s", instance.instance_id)
-            raise RunnerStartError(f"Unable to SSH run `ps aux` on {instance.instance_id}")
-        # Runner startup process is the parent process of runner.Listener and runner.Worker which
-        # starts up much faster.
-        if RUNNER_STARTUP_PROCESS not in result.stdout:
-            logger.warning("Runner startup process not found on %s", instance.instance_id)
-            raise RunnerStartError(f"Runner startup process not found on {instance.instance_id}")
-        logger.info("Runner startup process found to be healthy on %s", instance.instance_id)
-
-    @retry(tries=5, delay=60, local_logger=logger)
-    def _wait_runner_running(self, instance: OpenstackInstance) -> None:
-        """Wait until runner is running.
-
-        Args:
-            instance: The runner instance.
-
-        Raises:
-            RunnerStartError: The runner process was not found on the runner.
-        """
-        try:
-            ssh_conn = self._openstack_cloud.get_ssh_connection(instance)
-        except SSHError as err:
-            raise RunnerStartError(
-                f"Failed to SSH connect to {instance.instance_id} openstack runner"
-            ) from err
-
-        try:
-            healthy = health_checks.check_active_runner(
-                ssh_conn=ssh_conn, instance=instance, accept_finished_job=True
-            )
-        except OpenstackHealthCheckError as exc:
-            raise RunnerStartError(
-                f"Failed to check health of runner process on {instance.instance_id}"
-            ) from exc
-        if not healthy:
-            logger.info("Runner %s not considered healthy", instance.instance_id)
-            raise RunnerStartError(
-                f"Runner {instance.instance_id} failed to initialize after starting"
-            )
-
-        logger.info("Runner %s found to be healthy", instance.instance_id)
-
     @staticmethod
     def _run_runner_removal_script(
         instance_id: InstanceID, ssh_conn: SSHConnection, remove_token: str

@@ -11,15 +11,16 @@
 
 from github_runner_manager.configuration.github import GitHubConfiguration, GitHubRepo
 from github_runner_manager.errors import JobNotFoundError as GithubJobNotFoundError
-from github_runner_manager.github_client import GithubClient
+from github_runner_manager.github_client import GithubClient, GithubRunnerNotFoundError
 from github_runner_manager.manager.models import InstanceID, RunnerContext, RunnerMetadata
 from github_runner_manager.platform.platform_provider import (
     JobInfo,
     JobNotFoundError,
     PlatformProvider,
+    PlatformRunnerHealth,
     PlatformRunnerState,
 )
-from github_runner_manager.types_.github import SelfHostedRunner
+from github_runner_manager.types_.github import GitHubRunnerStatus, SelfHostedRunner
 
 logger = logging.getLogger(__name__)
 
@@ -58,6 +59,40 @@ def build(
             github_client=GithubClient(github_configuration.token),
         )
 
+    def get_runner_health(
+        self,
+        metadata: RunnerMetadata,
+        instance_id: InstanceID,
+    ) -> PlatformRunnerHealth:
+        """Get information on the health of a github runner.
+
+        Args:
+            metadata: Metadata for the runner.
+            instance_id: Instance ID of the runner.
+
+        Returns:
+            Information about the health status of the runner.
+        """
+        try:
+            runner = self._client.get_runner(self._path, self._prefix, int(metadata.runner_id))
+            online = runner.status == GitHubRunnerStatus.ONLINE
+            return PlatformRunnerHealth(
+                instance_id=instance_id,
+                metadata=metadata,
+                online=online,
+                busy=runner.busy,
+                deletable=False,
+            )
+
+        except GithubRunnerNotFoundError:
+            return PlatformRunnerHealth(
+                instance_id=instance_id,
+                metadata=metadata,
+                online=False,
+                busy=False,
+                deletable=True,
+            )
+
     def get_runners(
         self, states: Iterable[PlatformRunnerState] | None = None
     ) -> tuple[SelfHostedRunner, ...]:
@@ -69,7 +104,7 @@ def get_runners(
         Returns:
             Information on the runners.
         """
-        runner_list = self._client.get_runner_github_info(self._path, self._prefix)
+        runner_list = self._client.list_runners(self._path, self._prefix)
 
         if states is None:
             return tuple(runner_list)
Original file line number	Diff line number	Diff line change
Expand Up		@@ -5,7 +5,6 @@

		- Add a visualization of the share of jobs started per application.


		### 2025-04-22

		- Add how-to landing page.
Expand Down