Skip to content

Remove SSH creation health checks #534

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 20 commits into from
Apr 29, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@

- Add a visualization of the share of jobs started per application.


### 2025-04-22

- Add how-to landing page.
Expand Down
6 changes: 1 addition & 5 deletions github-runner-manager/src/github_runner_manager/errors.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,6 @@ class RunnerCreateError(RunnerError):
"""Error for runner creation failure."""


class RunnerStartError(RunnerError):
"""Error for runner start failure."""


class MissingServerConfigError(RunnerError):
"""Error for unable to create runner due to missing server configurations."""

Expand Down Expand Up @@ -46,7 +42,7 @@ class TokenError(PlatformClientError):


class JobNotFoundError(PlatformClientError):
"""Represents an error when the job could not be found on GitHub."""
"""Represents an error when the job could not be found on the platform."""


class CloudError(Exception):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@
from urllib.error import HTTPError

import requests

# HTTP404NotFoundError is not found by pylint
from fastcore.net import HTTP404NotFoundError # pylint: disable=no-name-in-module
from ghapi.all import GhApi, pages
from ghapi.page import paged
from requests import RequestException
Expand All @@ -29,6 +32,11 @@

logger = logging.getLogger(__name__)


class GithubRunnerNotFoundError(Exception):
"""Represents an error when the runner could not be found on GitHub."""


# Parameters of the function decorated with retry
ParamT = ParamSpec("ParamT")
# Return type of the function decorated with retry
Expand Down Expand Up @@ -89,8 +97,38 @@ def __init__(self, token: str):
self._client = GhApi(token=self._token)

@catch_http_errors
def get_runner_github_info(self, path: GitHubPath, prefix: str) -> list[SelfHostedRunner]:
"""Get runner information on GitHub under a repo or org.
def get_runner(self, path: GitHubPath, prefix: str, runner_id: int) -> SelfHostedRunner:
"""Get a specific self-hosted runner information under a repo or org.

Args:
path: GitHub repository path in the format '<owner>/<repo>', or the GitHub organization
name.
prefix: Build the InstanceID with this prefix.
runner_id: Runner id to get the self hosted runner.

Raises:
GithubRunnerNotFoundError: If the runner is not found.

Returns:
The information for the requested runner.
"""
try:
if isinstance(path, GitHubRepo):
raw_runner = self._client.actions.get_self_hosted_runner_for_repo(
path.owner, path.repo, runner_id
)
else:
raw_runner = self._client.actions.get_self_hosted_runner_for_org(
path.org, runner_id
)
except HTTP404NotFoundError as err:
raise GithubRunnerNotFoundError from err
instance_id = InstanceID.build_from_name(prefix, raw_runner["name"])
return SelfHostedRunner.build_from_github(raw_runner, instance_id)

@catch_http_errors
def list_runners(self, path: GitHubPath, prefix: str) -> list[SelfHostedRunner]:
"""Get all runners information on GitHub under a repo or org.

Args:
path: GitHub repository path in the format '<owner>/<repo>', or the GitHub organization
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import copy
import logging
import time
from dataclasses import dataclass
from enum import Enum, auto
from multiprocessing import Pool
Expand All @@ -23,11 +24,19 @@
from github_runner_manager.metrics import github as github_metrics
from github_runner_manager.metrics import runner as runner_metrics
from github_runner_manager.metrics.runner import RunnerMetrics
from github_runner_manager.platform.platform_provider import PlatformProvider, PlatformRunnerState
from github_runner_manager.platform.platform_provider import (
PlatformProvider,
PlatformRunnerState,
)
from github_runner_manager.types_.github import SelfHostedRunner

logger = logging.getLogger(__name__)

# After a runner is created, there will be as many health checks as
# elements in this variable. The elements in the tuple represent
# the time waiting before each health check against the platform provider.
RUNNER_CREATION_WAITING_TIMES = (60, 60, 120, 240, 480)

IssuedMetricEventsStats = dict[Type[metric_events.Event], int]


Expand Down Expand Up @@ -474,16 +483,59 @@ def _create_runner(args: _CreateRunnerArgs) -> InstanceID:
args.metadata.runner_id = str(github_runner.id)

try:
args.cloud_runner_manager.create_runner(
cloud_instance = args.cloud_runner_manager.create_runner(
instance_id=instance_id,
metadata=args.metadata,
runner_context=runner_context,
)

# This wait should be deleted to make the runner creation as
# quick as possible. The waiting should only be done in the
# reactive case, before checking that a job was taken.
RunnerManager.wait_for_runner_online(
platform_provider=args.platform_provider,
instance_id=cloud_instance.instance_id,
metadata=cloud_instance.metadata,
)

except RunnerError:
# try to clean the runner in GitHub. This is necessary, as for reactive runners
# we do not know in the clean up if the runner is offline because if failed or
# because it is being created.
logger.warning("Deleting runner %s from platform", instance_id)
logger.warning("Deleting runner %s from platform after creation failed", instance_id)
args.platform_provider.delete_runners([github_runner])
raise
return instance_id

@staticmethod
def wait_for_runner_online(
platform_provider: PlatformProvider,
instance_id: InstanceID,
metadata: RunnerMetadata,
) -> None:
"""Wait until the runner is online.

The constant RUNNER_CREATION_WAITING_TIMES defines the time before calling
the platform provider to check if the runner is online. Besides online runner,
deletable runner will also be equivalent to online, as no more waiting should
be needed.

Args:
platform_provider: Platform provider to use for health checks.
instance_id: InstanceID for the runner to wait for.
metadata: Metadata for the runner to wait for.

Raises:
RunnerError: If the runner did not come online after the specified time.

"""
for wait_time in RUNNER_CREATION_WAITING_TIMES:
time.sleep(wait_time)
try:
runner_health = platform_provider.get_runner_health(
metadata=metadata, instance_id=instance_id
)
except PlatformApiError as exc:
logger.error("Error getting the runner health: %s", exc)
continue
if runner_health.online or runner_health.deletable:
break
else:
raise RunnerError(f"Runner {instance_id} did not get online")
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
from typing import Iterable, Sequence

import fabric
import invoke
import jinja2
import paramiko
from fabric import Connection as SSHConnection
Expand All @@ -22,11 +21,9 @@
OpenStackError,
OpenstackHealthCheckError,
RunnerCreateError,
RunnerStartError,
SSHError,
)
from github_runner_manager.manager.cloud_runner_manager import (
CloudInitStatus,
CloudRunnerInstance,
CloudRunnerManager,
CloudRunnerState,
Expand Down Expand Up @@ -158,11 +155,6 @@ def create_runner(
except OpenStackError as err:
raise RunnerCreateError(f"Failed to create {instance_id} openstack runner") from err

logger.info("Waiting for runner process to startup: %s", instance.instance_id)
self._wait_runner_startup(instance)
logger.info("Waiting for runner process to be running: %s", instance.instance_id)
self._wait_runner_running(instance)

logger.info("Runner %s created successfully", instance.instance_id)
return self._build_cloud_runner_instance(instance)

Expand Down Expand Up @@ -508,80 +500,6 @@ def _check_state_and_flush(self, instance: OpenstackInstance, busy: bool) -> Non
result.stderr,
)

@retry(tries=10, delay=60, local_logger=logger)
def _wait_runner_startup(self, instance: OpenstackInstance) -> None:
"""Wait until runner is startup.

Args:
instance: The runner instance.

Raises:
RunnerStartError: The runner startup process was not found on the runner.
"""
try:
ssh_conn = self._openstack_cloud.get_ssh_connection(instance)
except SSHError as err:
raise RunnerStartError(
f"Failed to SSH to {instance.instance_id} during creation possible due to setup "
"not completed"
) from err

logger.debug("Running `cloud-init status` on instance %s.", instance.instance_id)
result: invoke.runners.Result = ssh_conn.run("cloud-init status", warn=True, timeout=60)
if not result.ok:
logger.warning(
"cloud-init status command failed on %s: %s.", instance.instance_id, result.stderr
)
raise RunnerStartError(f"Runner startup process not found on {instance.instance_id}")
# A short running job may have already completed and exited the runner, hence check the
# condition via cloud-init status check.
if CloudInitStatus.DONE in result.stdout:
return
logger.debug("Running `ps aux` on instance %s.", instance.instance_id)
result = ssh_conn.run("ps aux", warn=True, timeout=60, hide=True)
if not result.ok:
logger.warning("SSH run of `ps aux` failed on %s", instance.instance_id)
raise RunnerStartError(f"Unable to SSH run `ps aux` on {instance.instance_id}")
# Runner startup process is the parent process of runner.Listener and runner.Worker which
# starts up much faster.
if RUNNER_STARTUP_PROCESS not in result.stdout:
logger.warning("Runner startup process not found on %s", instance.instance_id)
raise RunnerStartError(f"Runner startup process not found on {instance.instance_id}")
logger.info("Runner startup process found to be healthy on %s", instance.instance_id)

@retry(tries=5, delay=60, local_logger=logger)
def _wait_runner_running(self, instance: OpenstackInstance) -> None:
"""Wait until runner is running.

Args:
instance: The runner instance.

Raises:
RunnerStartError: The runner process was not found on the runner.
"""
try:
ssh_conn = self._openstack_cloud.get_ssh_connection(instance)
except SSHError as err:
raise RunnerStartError(
f"Failed to SSH connect to {instance.instance_id} openstack runner"
) from err

try:
healthy = health_checks.check_active_runner(
ssh_conn=ssh_conn, instance=instance, accept_finished_job=True
)
except OpenstackHealthCheckError as exc:
raise RunnerStartError(
f"Failed to check health of runner process on {instance.instance_id}"
) from exc
if not healthy:
logger.info("Runner %s not considered healthy", instance.instance_id)
raise RunnerStartError(
f"Runner {instance.instance_id} failed to initialize after starting"
)

logger.info("Runner %s found to be healthy", instance.instance_id)

@staticmethod
def _run_runner_removal_script(
instance_id: InstanceID, ssh_conn: SSHConnection, remove_token: str
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,16 @@

from github_runner_manager.configuration.github import GitHubConfiguration, GitHubRepo
from github_runner_manager.errors import JobNotFoundError as GithubJobNotFoundError
from github_runner_manager.github_client import GithubClient
from github_runner_manager.github_client import GithubClient, GithubRunnerNotFoundError
from github_runner_manager.manager.models import InstanceID, RunnerContext, RunnerMetadata
from github_runner_manager.platform.platform_provider import (
JobInfo,
JobNotFoundError,
PlatformProvider,
PlatformRunnerHealth,
PlatformRunnerState,
)
from github_runner_manager.types_.github import SelfHostedRunner
from github_runner_manager.types_.github import GitHubRunnerStatus, SelfHostedRunner

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -58,6 +59,40 @@ def build(
github_client=GithubClient(github_configuration.token),
)

def get_runner_health(
self,
metadata: RunnerMetadata,
instance_id: InstanceID,
) -> PlatformRunnerHealth:
"""Get information on the health of a github runner.

Args:
metadata: Metadata for the runner.
instance_id: Instance ID of the runner.

Returns:
Information about the health status of the runner.
"""
try:
runner = self._client.get_runner(self._path, self._prefix, int(metadata.runner_id))
online = runner.status == GitHubRunnerStatus.ONLINE
return PlatformRunnerHealth(
instance_id=instance_id,
metadata=metadata,
online=online,
busy=runner.busy,
deletable=False,
)

except GithubRunnerNotFoundError:
return PlatformRunnerHealth(
instance_id=instance_id,
metadata=metadata,
online=False,
busy=False,
deletable=True,
)

def get_runners(
self, states: Iterable[PlatformRunnerState] | None = None
) -> tuple[SelfHostedRunner, ...]:
Expand All @@ -69,7 +104,7 @@ def get_runners(
Returns:
Information on the runners.
"""
runner_list = self._client.get_runner_github_info(self._path, self._prefix)
runner_list = self._client.list_runners(self._path, self._prefix)

if states is None:
return tuple(runner_list)
Expand Down
Loading
Loading