Skip to content

Remove SSH creation health checks #534

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ class TokenError(PlatformClientError):


class JobNotFoundError(PlatformClientError):
    """Represents an error when the job could not be found on the platform."""


class CloudError(Exception):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@
from urllib.error import HTTPError

import requests

# HTTP404NotFoundError is not found by pylint
from fastcore.net import HTTP404NotFoundError # pylint: disable=no-name-in-module
from ghapi.all import GhApi, pages
from ghapi.page import paged
from requests import RequestException
Expand All @@ -29,6 +32,11 @@

logger = logging.getLogger(__name__)


class GithubRunnerNotFoundError(Exception):
    """Raised when GitHub reports that the requested self-hosted runner does not exist."""


# Parameters of the function decorated with retry
ParamT = ParamSpec("ParamT")
# Return type of the function decorated with retry
Expand Down Expand Up @@ -88,6 +96,42 @@ def __init__(self, token: str):
self._token = token
self._client = GhApi(token=self._token)

@catch_http_errors
def get_runner_info(self, path: GitHubPath, prefix: str, runner_id: int) -> SelfHostedRunner:
    """Fetch a single self-hosted runner from GitHub by its runner ID.

    The repository endpoint is used when ``path`` is a ``GitHubRepo``; otherwise the
    organization endpoint is used.

    https://docs.github.com/en/rest/actions/self-hosted-runners?
    apiVersion=2022-11-28#get-a-self-hosted-runner-for-an-organization
    /orgs/{org}/actions/runners/{runner_id}
    https://docs.github.com/en/rest/actions/self-hosted-runners?
    apiVersion=2022-11-28#get-a-self-hosted-runner-for-a-repository
    /repos/{owner}/{repo}/actions/runners/{runner_id}

    Args:
        path: GitHub repository or organization the runner is registered under.
        prefix: Prefix passed to ``InstanceID.build_from_name`` together with the
            runner name returned by GitHub.
        runner_id: GitHub ID of the runner to look up.

    Raises:
        GithubRunnerNotFoundError: If the GitHub API answers with HTTP 404.

    Returns:
        The runner built from the GitHub response.
    """
    try:
        actions_api = self._client.actions
        raw_runner = (
            actions_api.get_self_hosted_runner_for_repo(path.owner, path.repo, runner_id)
            if isinstance(path, GitHubRepo)
            else actions_api.get_self_hosted_runner_for_org(path.org, runner_id)
        )
    except HTTP404NotFoundError as err:
        raise GithubRunnerNotFoundError from err
    return SelfHostedRunner.build_from_github(
        raw_runner, InstanceID.build_from_name(prefix, raw_runner["name"])
    )

@catch_http_errors
def get_runner_github_info(self, path: GitHubPath, prefix: str) -> list[SelfHostedRunner]:
"""Get runner information on GitHub under a repo or org.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import copy
import logging
import time
from dataclasses import dataclass
from enum import Enum, auto
from multiprocessing import Pool
Expand All @@ -23,11 +24,18 @@
from github_runner_manager.metrics import github as github_metrics
from github_runner_manager.metrics import runner as runner_metrics
from github_runner_manager.metrics.runner import RunnerMetrics
from github_runner_manager.platform.platform_provider import PlatformProvider, PlatformRunnerState
from github_runner_manager.types_.github import SelfHostedRunner
from github_runner_manager.platform.platform_provider import (
PlatformProvider,
PlatformRunnerState,
RunnerNotFoundError,
)
from github_runner_manager.types_.github import GitHubRunnerStatus, SelfHostedRunner

logger = logging.getLogger(__name__)

# Delays, in seconds, slept around each successive check of whether a newly created
# runner is online (see RunnerManager._wait_for_runner_online). One check is made per
# entry, so the tuple length bounds the number of checks before creation is given up.
RUNNER_CREATION_WAITING_TIMES = (60, 60, 120, 240, 480)

IssuedMetricEventsStats = dict[Type[metric_events.Event], int]


Expand Down Expand Up @@ -474,16 +482,54 @@ def _create_runner(args: _CreateRunnerArgs) -> InstanceID:
args.metadata.runner_id = str(github_runner.id)

try:
args.cloud_runner_manager.create_runner(
cloud_instance = args.cloud_runner_manager.create_runner(
instance_id=instance_id,
metadata=args.metadata,
runner_context=runner_context,
)
logger.info("JAVI cloud_instance %s", cloud_instance)
logger.info("JAVI metadata %s", args.metadata)

# TODO WAIT FOR RUNNER ONLINE IN HERE!
# TODO THIS CODE SHOULD DISAPPEAR AND ONLY WAIT FOR THE RUNNER IN REACTIVE MOD
# (TO CHECK IF THE JOB WAS TAKEN)
RunnerManager._wait_for_runner_online(
platform_provider=args.platform_provider,
instance_id=instance_id,
metadata=args.metadata,
)

logger.info("JAVI after runner created and waited")

except RunnerError:
# Try to clean the runner in GitHub. This is necessary, as for reactive runners
# we do not know in the clean up if the runner is offline because it failed or
# because it is being created.
logger.warning("Deleting runner %s from platform", instance_id)
logger.warning("Deleting runner %s from platform after creation failed", instance_id)
args.platform_provider.delete_runners([github_runner])
raise
return instance_id

@staticmethod
def _wait_for_runner_online(
    platform_provider: PlatformProvider,
    instance_id: InstanceID,
    metadata: RunnerMetadata,
) -> None:
    """Wait until the platform reports the runner as online or deletable.

    Sleeps once for each delay in RUNNER_CREATION_WAITING_TIMES and queries the
    platform after every sleep. Returns as soon as the runner is online or deletable.
    If the platform does not know the runner, the wait stops without raising.

    Args:
        platform_provider: Platform to query for the runner.
        instance_id: Instance ID of the runner.
        metadata: Metadata for the runner.

    Raises:
        RunnerError: If the runner is neither online nor deletable after the last check.
    """
    for wait_time in RUNNER_CREATION_WAITING_TIMES:
        # Sleep exactly once per iteration. A second sleep after the check would
        # double every delay and add a useless wait before the final failure.
        time.sleep(wait_time)
        try:
            runner = platform_provider.get_runner(metadata, instance_id)
        except RunnerNotFoundError:
            # TODO SHOULD WE RAISE RunnerError in here? We expect a runner to be in the
            # platform, and we will save time...
            logger.error("JAVI Runner not found")
            return
        logger.info("JAVI github runner %s", runner)
        # TODO REVIEW THE ONLINE THING FOR JOBMANAGER. WHAT IS ONLINE
        # AND OFFLINE IN THAT CASE?
        if runner.status == GitHubRunnerStatus.ONLINE or runner.deletable:
            logger.info("JAVI nice! runner online!")
            return
    logger.info("JAVI grrr runner never got online!")
    raise RunnerError("Runner did not get online")
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
from typing import Iterable, Sequence

import fabric
import invoke
import jinja2
import paramiko
from fabric import Connection as SSHConnection
Expand All @@ -26,7 +25,6 @@
SSHError,
)
from github_runner_manager.manager.cloud_runner_manager import (
CloudInitStatus,
CloudRunnerInstance,
CloudRunnerManager,
CloudRunnerState,
Expand Down Expand Up @@ -158,11 +156,6 @@ def create_runner(
except OpenStackError as err:
raise RunnerCreateError(f"Failed to create {instance_id} openstack runner") from err

logger.info("Waiting for runner process to startup: %s", instance.instance_id)
self._wait_runner_startup(instance)
logger.info("Waiting for runner process to be running: %s", instance.instance_id)
self._wait_runner_running(instance)

logger.info("Runner %s created successfully", instance.instance_id)
return self._build_cloud_runner_instance(instance)

Expand Down Expand Up @@ -508,47 +501,6 @@ def _check_state_and_flush(self, instance: OpenstackInstance, busy: bool) -> Non
result.stderr,
)

@retry(tries=10, delay=60, local_logger=logger)
def _wait_runner_startup(self, instance: OpenstackInstance) -> None:
    """Check over SSH that the runner startup process is present on the instance.

    ``cloud-init status`` is queried first. When it reports done the check passes
    without inspecting the process list, because a short running job may already have
    completed and exited the runner. Otherwise the output of ``ps aux`` must contain
    the runner startup process.

    Args:
        instance: The runner instance.

    Raises:
        RunnerStartError: The SSH connection, the ``cloud-init status`` command or the
            ``ps aux`` command failed, or the runner startup process was not found.
    """
    runner_name = instance.instance_id
    try:
        connection = self._openstack_cloud.get_ssh_connection(instance)
    except SSHError as err:
        raise RunnerStartError(
            f"Failed to SSH to {runner_name} during creation possible due to setup "
            "not completed"
        ) from err

    logger.debug("Running `cloud-init status` on instance %s.", runner_name)
    cloud_init: invoke.runners.Result = connection.run(
        "cloud-init status", warn=True, timeout=60
    )
    if not cloud_init.ok:
        logger.warning(
            "cloud-init status command failed on %s: %s.", runner_name, cloud_init.stderr
        )
        raise RunnerStartError(f"Runner startup process not found on {runner_name}")
    if CloudInitStatus.DONE in cloud_init.stdout:
        # cloud-init finished: the runner may already have run a short job and exited.
        return

    logger.debug("Running `ps aux` on instance %s.", runner_name)
    process_list = connection.run("ps aux", warn=True, timeout=60, hide=True)
    if not process_list.ok:
        logger.warning("SSH run of `ps aux` failed on %s", runner_name)
        raise RunnerStartError(f"Unable to SSH run `ps aux` on {runner_name}")
    # The startup process is the parent of runner.Listener and runner.Worker and
    # starts up much faster than either of them.
    if RUNNER_STARTUP_PROCESS not in process_list.stdout:
        logger.warning("Runner startup process not found on %s", runner_name)
        raise RunnerStartError(f"Runner startup process not found on {runner_name}")
    logger.info("Runner startup process found to be healthy on %s", runner_name)

@retry(tries=5, delay=60, local_logger=logger)
def _wait_runner_running(self, instance: OpenstackInstance) -> None:
"""Wait until runner is running.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,14 @@

from github_runner_manager.configuration.github import GitHubConfiguration, GitHubRepo
from github_runner_manager.errors import JobNotFoundError as GithubJobNotFoundError
from github_runner_manager.github_client import GithubClient
from github_runner_manager.github_client import GithubClient, GithubRunnerNotFoundError
from github_runner_manager.manager.models import InstanceID, RunnerContext, RunnerMetadata
from github_runner_manager.platform.platform_provider import (
JobInfo,
JobNotFoundError,
PlatformProvider,
PlatformRunnerState,
RunnerNotFoundError,
)
from github_runner_manager.types_.github import SelfHostedRunner

Expand Down Expand Up @@ -58,6 +59,32 @@ def build(
github_client=GithubClient(github_configuration.token),
)

def get_runner(
    self,
    metadata: RunnerMetadata,
    instance_id: InstanceID,
) -> SelfHostedRunner:
    """Get info on self-hosted runner.

    Args:
        metadata: Metadata for the runner. Its runner_id is used for the GitHub lookup.
        instance_id: Instance ID of the runner. Not used by this implementation.

    Raises:
        RunnerNotFoundError: If the runner could not be found on GitHub.

    Returns:
        The runner as reported by the GitHub client.
    """
    try:
        return self._client.get_runner_info(
            self._path, self._prefix, int(metadata.runner_id)
        )
    except GithubRunnerNotFoundError as err:
        # Chain from the caught instance (not the exception class) so that the
        # original error and its traceback are preserved as __cause__.
        raise RunnerNotFoundError from err

def get_runners(
self, states: Iterable[PlatformRunnerState] | None = None
) -> tuple[SelfHostedRunner, ...]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,55 @@ def build(cls) -> "JobManagerPlatform":
"""
return cls()

def get_runner(
    self,
    metadata: RunnerMetadata,
    instance_id: InstanceID,
) -> SelfHostedRunner:
    """Build a runner description from the health of its job in the jobmanager.

    Args:
        metadata: Metadata for the runner. Provides the jobmanager URL and the
            runner ID, which is used as the job ID in the health request.
        instance_id: Instance ID of the runner.

    Raises:
        PlatformApiError: If the jobmanager health request fails.

    Returns:
        The runner in the jobmanager.
    """
    client_config = jobmanager_client.Configuration(host=metadata.url)
    with jobmanager_client.ApiClient(client_config) as api_client:
        jobs_api = jobmanager_client.DefaultApi(api_client)
        try:
            job_id = int(metadata.runner_id)
            health = jobs_api.v1_jobs_job_id_health_get(job_id)
        # TODO: handle "job not found" separately from other API errors.
        except ApiException as exc:
            logger.exception("Error calling jobmanager api.")
            raise PlatformApiError("API error") from exc

        # health.status is one of: PENDING, IN_PROGRESS, COMPLETED, FAILED, CANCELLED
        # NOTE(review): busy is True only for FAILED jobs, and every other status
        # maps to ONLINE - confirm this is the intended mapping.
        failed = health.status == "FAILED"
        return SelfHostedRunner(
            busy=failed,
            id=job_id,
            metadata=metadata,
            # TODO unfortunately, we only have one label.
            labels=[SelfHostedRunnerLabel(health.label)],
            status=GitHubRunnerStatus.OFFLINE if failed else GitHubRunnerStatus.ONLINE,
            instance_id=instance_id,
            deletable=health.deletable,
        )

def get_runners(
self, states: Iterable[PlatformRunnerState] | None = None
) -> tuple[SelfHostedRunner, ...]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,22 @@ def build(
jobmanager_platform = JobManagerPlatform.build()
return cls({"github": github_platform, "jobmanager": jobmanager_platform})

def get_runner(self, metadata: RunnerMetadata, instance_id: InstanceID) -> SelfHostedRunner:
    """Fetch a runner from the platform provider selected by its metadata.

    Args:
        metadata: Metadata for the runner. Also used to select the provider.
        instance_id: Instance ID of the runner.

    Returns:
        Platform Runner information.
    """
    provider = self._get_provider(metadata)
    return provider.get_runner(metadata, instance_id)

def get_runners(
self, states: Iterable[PlatformRunnerState] | None = None
) -> tuple[SelfHostedRunner, ...]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,26 @@ class JobNotFoundError(PlatformError):
"""Represents an error when the job could not be found on the platform."""


class RunnerNotFoundError(PlatformError):
    """Raised when the requested runner does not exist on the platform."""


class PlatformProvider(abc.ABC):
"""Base class for a Platform Provider."""

@abc.abstractmethod
def get_runner(self, metadata: RunnerMetadata, instance_id: InstanceID) -> SelfHostedRunner:
    """Get info on self-hosted runner.

    Args:
        metadata: Metadata for the runner.
        instance_id: Instance ID of the runner.

    Raises:
        RunnerNotFoundError: Implementations may raise this when the platform does
            not know the runner.

    Returns:
        The runner as known to the platform.
    """

@abc.abstractmethod
def get_runners(
self, states: "Iterable[PlatformRunnerState] | None" = None
Expand Down
Loading
Loading