From 7ececd09d32d139aece16d2882ee06d6366d5a87 Mon Sep 17 00:00:00 2001 From: Vladimir Belitskiy Date: Mon, 16 Sep 2024 17:26:00 +0000 Subject: [PATCH 01/12] Use GH API to retrieve PR labels when waiting on connection. In GH Actions context, labels do not get updated on simply being added, usually requiring a change be pushed to refresh them: https://github.com/orgs/community/discussions/39062 --- .../workflows/wait-for-connection-test.yaml | 12 ++-- actions/ci_connection/action.yaml | 59 ++++++++++++++++--- 2 files changed, 58 insertions(+), 13 deletions(-) diff --git a/.github/workflows/wait-for-connection-test.yaml b/.github/workflows/wait-for-connection-test.yaml index c99306c505c2..aad898c9bd83 100644 --- a/.github/workflows/wait-for-connection-test.yaml +++ b/.github/workflows/wait-for-connection-test.yaml @@ -8,9 +8,13 @@ on: workflow_dispatch: inputs: halt-for-connection: - description: 'Should this invocation wait for a remote connection?' - required: false - default: '0' + description: 'Should this workflow run wait for a remote connection?' + type: choice + required: true + default: 'no' + options: + - 'yes' + - 'no' # Cancel any previous iterations if a new commit is pushed concurrency: group: ${{ github.workflow }}-${{ github.ref }} @@ -20,7 +24,7 @@ jobs: strategy: fail-fast: false matrix: - runner: ["linux-x86-n2-64","linux-arm64-t2a-48"] + runner: ["linux-x86-n2-16"] instances: ["1"] runs-on: ${{ matrix.runner }} timeout-minutes: 60 diff --git a/actions/ci_connection/action.yaml b/actions/ci_connection/action.yaml index 59dc39278db2..e72e1209f2c9 100644 --- a/actions/ci_connection/action.yaml +++ b/actions/ci_connection/action.yaml @@ -19,22 +19,63 @@ inputs: runs: using: "composite" steps: - - name: Print halt conditions + - name: Get current labels + shell: bash + id: get-labels + env: + FALLBACK_LABELS: ${{ toJSON(github.event.pull_request.labels) }} + run: | + # Fetch labels using GitHub API, since labels from context may be stale: + # https://github.com/orgs/community/discussions/39062 + response=$(curl -sL \ + -H "Accept: application/vnd.github+json" \ + -H "Authorization: Bearer ${{ github.token }}" \ + https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/labels) + + # In case of API failure, fall back on labels from workflow context + if echo "$response" | grep -q '"name":'; then + labels_json="$response" + else + echo "Could not retrieve labels via API, falling back to using context" + labels_json="${FALLBACK_LABELS}" + fi + + # Pick an existing Python alias + python_bin=$(which python3 2>/dev/null || which python) + + labels=$(echo "$labels_json" | $python_bin -c "import sys, json; labels=[label['name'] for label in json.load(sys.stdin)]; print(json.dumps(labels))") + + # fallback_labels=$(echo "${FALLBACK_LABELS}" | $python_bin -c "import sys, json; labels=[label['name'] for label in json.load(sys.stdin)]; print(json.dumps(labels))") + # echo 'Check out API labels:' + # echo "${labels}" + # echo 'check out context labels' + # echo "${fallback_labels}" + + # Set the labels as an output for this step, to be used in the next steps + echo "labels=$labels" >> $GITHUB_OUTPUT + + - name: Print out halt conditions shell: bash run: | - echo "All labels: ${{ toJSON(github.event.pull_request.labels.*.name) }}" + echo "All labels: ${{ steps.get-labels.outputs.labels }}" echo "Halt retry tag: ${{ inputs.should-wait-retry-tag }}" - echo "Halt always tag ${{ inputs.should-wait-always-tag}}" + echo "Halt always tag: ${{ inputs.should-wait-always-tag }}" echo "Should halt input: ${{ inputs.halt-dispatch-input }}" echo "Reattempt count: ${{ github.run_attempt }}" - echo "PR number ${{ github.event.number }}" - - name: Halt For Connection + echo "PR number: ${{ github.event.pull_request.number }}" + + - name: Halt for connection shell: bash + # fromJSON parses the search string into an actual array + # https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/evaluate-expressions-in-workflows-and-actions#example-matching-an-array-of-strings + + # Wait on 2+ retries, if the wait-on-retry label is set + # Always wait, if the always-wait label is set + # Always wait if the workflow was triggered manually, and the value was set if: | - (contains(github.event.pull_request.labels.*.name, inputs.should-wait-retry-tag) && github.run_attempt > 1) || - contains(github.event.pull_request.labels.*.name, inputs.should-wait-always-tag) || - inputs.halt-dispatch-input == '1' || - inputs.halt-dispatch-input == 'yes' + contains(fromJSON(steps.get-labels.outputs.labels), inputs.should-wait-retry-tag) && github.run_attempt > 1 || + contains(fromJSON(steps.get-labels.outputs.labels), inputs.should-wait-always-tag) || + contains(fromJSON('["yes", "Yes", "y", "1"]'), inputs.halt-dispatch-input) env: REPOSITORY: ${{ inputs.repository }} INTERACTIVE_CI: 1 From 08fbad7f4d8c13ce877b5d834af63351444373bb Mon Sep 17 00:00:00 2001 From: Vladimir Belitskiy Date: Mon, 16 Sep 2024 17:52:46 +0000 Subject: [PATCH 02/12] Safeguard against the case of no labels. --- actions/ci_connection/action.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/actions/ci_connection/action.yaml b/actions/ci_connection/action.yaml index e72e1209f2c9..bdd60c8e162a 100644 --- a/actions/ci_connection/action.yaml +++ b/actions/ci_connection/action.yaml @@ -33,6 +33,7 @@ runs: https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/labels) # In case of API failure, fall back on labels from workflow context + if echo "$response" | grep -q '"name":'; then labels_json="$response" else @@ -40,6 +41,12 @@ runs: labels_json="${FALLBACK_LABELS}" fi + # Safeguard against no labels, to prevent parsing failures + if [ -z "$labels_json" ]; then + echo "No PR labels detected." + labels_json='[]' + fi + # Pick an existing Python alias python_bin=$(which python3 2>/dev/null || which python) From 8ab2e5b4ad9206e5e28add4b514e09e0e3a050b8 Mon Sep 17 00:00:00 2001 From: Vladimir Belitskiy Date: Mon, 16 Sep 2024 17:57:23 +0000 Subject: [PATCH 03/12] Debug the labels. --- actions/ci_connection/action.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/actions/ci_connection/action.yaml b/actions/ci_connection/action.yaml index bdd60c8e162a..4d3b2bd5ffa9 100644 --- a/actions/ci_connection/action.yaml +++ b/actions/ci_connection/action.yaml @@ -41,6 +41,9 @@ runs: labels_json="${FALLBACK_LABELS}" fi + echo 'See the labels' + echo "$labels_json" + # Safeguard against no labels, to prevent parsing failures if [ -z "$labels_json" ]; then echo "No PR labels detected." From 88651451d43926d7e67b062d27810008964a59df Mon Sep 17 00:00:00 2001 From: Vladimir Belitskiy Date: Mon, 16 Sep 2024 18:06:52 +0000 Subject: [PATCH 04/12] Use an empty labels' list for branch runs. --- actions/ci_connection/action.yaml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/actions/ci_connection/action.yaml b/actions/ci_connection/action.yaml index 4d3b2bd5ffa9..644422c1e6e1 100644 --- a/actions/ci_connection/action.yaml +++ b/actions/ci_connection/action.yaml @@ -28,6 +28,7 @@ runs: # Fetch labels using GitHub API, since labels from context may be stale: # https://github.com/orgs/community/discussions/39062 response=$(curl -sL \ + -H "X-GitHub-Api-Version:2022-11-28" \ -H "Accept: application/vnd.github+json" \ -H "Authorization: Bearer ${{ github.token }}" \ https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/labels) @@ -41,11 +42,12 @@ runs: labels_json="${FALLBACK_LABELS}" fi - echo 'See the labels' - echo "$labels_json" + # echo 'See the labels' + # echo "$labels_json" - # Safeguard against no labels, to prevent parsing failures - if [ -z "$labels_json" ]; then + # Safeguard against no labels, to prevent parsing failures. + # This will happen when a run is triggered on a branch, instead of a PR. + if [ "$labels_json" == null ]; then echo "No PR labels detected." labels_json='[]' fi From 54a6fa86daa444a3c07415bfa58ff3cd7e2a4a93 Mon Sep 17 00:00:00 2001 From: Vladimir Belitskiy Date: Tue, 17 Sep 2024 18:06:59 +0000 Subject: [PATCH 05/12] Do label retrieval via Python. --- actions/ci_connection/action.yaml | 44 ++----------- actions/ci_connection/get_labels.py | 99 +++++++++++++++++++++++++++++ 2 files changed, 103 insertions(+), 40 deletions(-) create mode 100644 actions/ci_connection/get_labels.py diff --git a/actions/ci_connection/action.yaml b/actions/ci_connection/action.yaml index 644422c1e6e1..d3552522bc18 100644 --- a/actions/ci_connection/action.yaml +++ b/actions/ci_connection/action.yaml @@ -22,50 +22,14 @@ runs: - name: Get current labels shell: bash id: get-labels - env: - FALLBACK_LABELS: ${{ toJSON(github.event.pull_request.labels) }} run: | - # Fetch labels using GitHub API, since labels from context may be stale: - # https://github.com/orgs/community/discussions/39062 - response=$(curl -sL \ - -H "X-GitHub-Api-Version:2022-11-28" \ - -H "Accept: application/vnd.github+json" \ - -H "Authorization: Bearer ${{ github.token }}" \ - https://api.github.com/repos/${{ github.repository }}/issues/${{ github.event.pull_request.number }}/labels) - - # In case of API failure, fall back on labels from workflow context - - if echo "$response" | grep -q '"name":'; then - labels_json="$response" - else - echo "Could not retrieve labels via API, falling back to using context" - labels_json="${FALLBACK_LABELS}" - fi - - # echo 'See the labels' - # echo "$labels_json" - - # Safeguard against no labels, to prevent parsing failures. - # This will happen when a run is triggered on a branch, instead of a PR. - if [ "$labels_json" == null ]; then - echo "No PR labels detected." - labels_json='[]' - fi - # Pick an existing Python alias python_bin=$(which python3 2>/dev/null || which python) - - labels=$(echo "$labels_json" | $python_bin -c "import sys, json; labels=[label['name'] for label in json.load(sys.stdin)]; print(json.dumps(labels))") - - # fallback_labels=$(echo "${FALLBACK_LABELS}" | $python_bin -c "import sys, json; labels=[label['name'] for label in json.load(sys.stdin)]; print(json.dumps(labels))") - # echo 'Check out API labels:' - # echo "${labels}" - # echo 'check out context labels' - # echo "${fallback_labels}" + # Get the labels + labels=$($python_bin "$GITHUB_ACTION_PATH"/get_labels.py) + # Store them in the output, to be used in subsequent steps + echo "labels=$labels" >> "$GITHUB_OUTPUT" - # Set the labels as an output for this step, to be used in the next steps - echo "labels=$labels" >> $GITHUB_OUTPUT - - name: Print out halt conditions shell: bash run: | diff --git a/actions/ci_connection/get_labels.py b/actions/ci_connection/get_labels.py new file mode 100644 index 000000000000..b723faf83424 --- /dev/null +++ b/actions/ci_connection/get_labels.py @@ -0,0 +1,99 @@ +"""Retrieve labels for the PR from context, if any. + +While these labels are also available via GH context, and the event payload +file, they may be stale: +https://github.com/orgs/community/discussions/39062 + +As such, the API is used as the main source, with the event payload file +being the fallback. + +The script is only geared towards use within a GH Action run. +""" + +import json +import logging +import os +import re +import sys +import time +import urllib.request + + +# GET_LABELS_DEBUG is a variable specifically for this script. +# RUNNER_DEBUG and ACTIONS_RUNNER_DEBUG are GH env vars, which can be set +# in various ways, one of them - enabling debug logging from the UI, when +# triggering a run +# https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/store-information-in-variables#default-environment-variables +# https://docs.github.com/en/actions/monitoring-and-troubleshooting-workflows/troubleshooting-workflows/enabling-debug-logging#enabling-runner-diagnostic-logging +_SHOW_DEBUG = bool( + os.getenv('GET_LABELS_DEBUG', + os.getenv('RUNNER_DEBUG', + os.getenv('ACTIONS_RUNNER_DEBUG'))) +) +logging.basicConfig(level=logging.INFO if not _SHOW_DEBUG else logging.DEBUG, + format='%(levelname)s: %(message)s', stream=sys.stderr) + + +_GITHUB_REF = os.environ.get('GITHUB_REF') +# Outside a PR context - no labels to be found +if not _GITHUB_REF.startswith('refs/pull/'): + logging.debug('Not a PR run') + print([]) + raise SystemExit + +# Since passing the previous check confirms this is a PR, there's no need +# to safeguard this regex +GH_ISSUE = re.search(r'\d+', _GITHUB_REF).group() +GH_REPO = os.environ.get('GITHUB_REPOSITORY') + +logging.debug(f'{GH_ISSUE=!r}\n' + f'{GH_REPO=!r}') + +URL = f'https://api.github.com/repos/{GH_REPO}/issues/{GH_ISSUE}/labels' + +WAIT_TIME = 3 +ATTEMPTS = 3 + +data = None +cur_attempt = 1 + +while cur_attempt <= ATTEMPTS: + request = urllib.request.Request( + URL, + headers={'Accept': 'application/vnd.github+json', + 'X-GitHub-Api-Version': '2022-11-28'} + ) + logging.info(f'Retrieving PR labels via API - attempt {cur_attempt}...') + response = urllib.request.urlopen(request) + + if response.status == 200: + data = response.read().decode('utf-8') + logging.debug('API labels data: \n' + f'{data}') + break + else: + logging.error(f'Request failed with status code: {response.status}') + cur_attempt += 1 + if cur_attempt <= ATTEMPTS: + logging.info(f'Trying again in {WAIT_TIME} seconds') + time.sleep(WAIT_TIME) + + +# The null check is probably unnecessary, but rather be safe +if data and data != 'null': + data_json = json.loads(data) +else: + # Fall back on labels from the event's payload + event_payload_path = os.environ.get('GITHUB_EVENT_PATH') + with open(event_payload_path, 'r', encoding='utf-8') as event_payload: + data_json = json.load(event_payload).get('pull_request', + {}).get('labels', []) + logging.info('Using fallback labels') + logging.info(f'Fallback labels: \n' + f'{data_json}') + + +labels = [label['name'] for label in data_json] +logging.debug(f'Final labels: \n' + f'{labels}') +print(labels) From c107d68d1b7df4a6b306d5502bd4a3cffbed675f Mon Sep 17 00:00:00 2001 From: Vladimir Belitskiy Date: Tue, 17 Sep 2024 18:43:30 +0000 Subject: [PATCH 06/12] Minor touch-ups - comments, etc. --- actions/ci_connection/get_labels.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/actions/ci_connection/get_labels.py b/actions/ci_connection/get_labels.py index b723faf83424..2755cefa30d7 100644 --- a/actions/ci_connection/get_labels.py +++ b/actions/ci_connection/get_labels.py @@ -1,10 +1,10 @@ -"""Retrieve labels for the PR from context, if any. +"""Retrieve PR labels, if any. While these labels are also available via GH context, and the event payload file, they may be stale: https://github.com/orgs/community/discussions/39062 -As such, the API is used as the main source, with the event payload file +Thus, the API is used as the main source, with the event payload file being the fallback. The script is only geared towards use within a GH Action run. @@ -19,6 +19,7 @@ import urllib.request +# Check if debug logging should be enabled for the script: # GET_LABELS_DEBUG is a variable specifically for this script. # RUNNER_DEBUG and ACTIONS_RUNNER_DEBUG are GH env vars, which can be set # in various ways, one of them - enabling debug logging from the UI, when @@ -34,6 +35,7 @@ format='%(levelname)s: %(message)s', stream=sys.stderr) +# Check if this is a PR (pull request) _GITHUB_REF = os.environ.get('GITHUB_REF') # Outside a PR context - no labels to be found if not _GITHUB_REF.startswith('refs/pull/'): @@ -41,22 +43,22 @@ print([]) raise SystemExit +# Get the PR number # Since passing the previous check confirms this is a PR, there's no need # to safeguard this regex -GH_ISSUE = re.search(r'\d+', _GITHUB_REF).group() +GH_ISSUE = re.search(r'/refs/pull/(\d+)/merge', _GITHUB_REF).group(1) GH_REPO = os.environ.get('GITHUB_REPOSITORY') - -logging.debug(f'{GH_ISSUE=!r}\n' - f'{GH_REPO=!r}') - URL = f'https://api.github.com/repos/{GH_REPO}/issues/{GH_ISSUE}/labels' - WAIT_TIME = 3 ATTEMPTS = 3 data = None cur_attempt = 1 +logging.debug(f'{GH_ISSUE=!r}\n' + f'{GH_REPO=!r}') + +# Try retrieving the labels' info via API while cur_attempt <= ATTEMPTS: request = urllib.request.Request( URL, @@ -83,7 +85,7 @@ if data and data != 'null': data_json = json.loads(data) else: - # Fall back on labels from the event's payload + # Fall back on labels from the event's payload, if API failed (unlikely) event_payload_path = os.environ.get('GITHUB_EVENT_PATH') with open(event_payload_path, 'r', encoding='utf-8') as event_payload: data_json = json.load(event_payload).get('pull_request', @@ -96,4 +98,5 @@ labels = [label['name'] for label in data_json] logging.debug(f'Final labels: \n' f'{labels}') +# Output the labels to stdout for further use elsewhere print(labels) From 8249a140f184a99bc0912c5352066bb36c41f6cf Mon Sep 17 00:00:00 2001 From: Vladimir Belitskiy Date: Tue, 17 Sep 2024 18:49:10 +0000 Subject: [PATCH 07/12] Fix the regex after making it stricter. --- actions/ci_connection/action.yaml | 2 +- actions/ci_connection/get_labels.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/actions/ci_connection/action.yaml b/actions/ci_connection/action.yaml index d3552522bc18..cd3114173cec 100644 --- a/actions/ci_connection/action.yaml +++ b/actions/ci_connection/action.yaml @@ -38,7 +38,7 @@ runs: echo "Halt always tag: ${{ inputs.should-wait-always-tag }}" echo "Should halt input: ${{ inputs.halt-dispatch-input }}" echo "Reattempt count: ${{ github.run_attempt }}" - echo "PR number: ${{ github.event.pull_request.number }}" + echo "PR number: ${{ github.event.number }}" - name: Halt for connection shell: bash diff --git a/actions/ci_connection/get_labels.py b/actions/ci_connection/get_labels.py index 2755cefa30d7..54f2833d53ab 100644 --- a/actions/ci_connection/get_labels.py +++ b/actions/ci_connection/get_labels.py @@ -46,7 +46,7 @@ # Get the PR number # Since passing the previous check confirms this is a PR, there's no need # to safeguard this regex -GH_ISSUE = re.search(r'/refs/pull/(\d+)/merge', _GITHUB_REF).group(1) +GH_ISSUE = re.search(r'refs/pull/(\d+)/merge', _GITHUB_REF).group(1) GH_REPO = os.environ.get('GITHUB_REPOSITORY') URL = f'https://api.github.com/repos/{GH_REPO}/issues/{GH_ISSUE}/labels' WAIT_TIME = 3 From c6745345f23c6106645edaf4b2b01448f46d3c30 Mon Sep 17 00:00:00 2001 From: Vladimir Belitskiy Date: Wed, 18 Sep 2024 15:04:30 +0000 Subject: [PATCH 08/12] Merge ci_connection action steps, move condition-checking to Python. --- actions/ci_connection/__init__.py | 0 actions/ci_connection/action.yaml | 63 ++------ actions/ci_connection/get_labels.py | 155 +++++++++---------- actions/ci_connection/wait_for_connection.py | 154 ++++++++++++------ 4 files changed, 193 insertions(+), 179 deletions(-) create mode 100644 actions/ci_connection/__init__.py diff --git a/actions/ci_connection/__init__.py b/actions/ci_connection/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/actions/ci_connection/action.yaml b/actions/ci_connection/action.yaml index cd3114173cec..3795c11fab5b 100644 --- a/actions/ci_connection/action.yaml +++ b/actions/ci_connection/action.yaml @@ -1,61 +1,24 @@ name: "Wait For Connection" -description: 'Action to wait for connection from user' +description: 'Action to wait for connection from user (condtionally)' inputs: halt-dispatch-input: - description: 'Should the action wait for user connection from workflow_dispatch' + description: 'Should the action wait for user connection from workflow_dispatch?' required: false - default: "0" - should-wait-retry-tag: - description: "Tag that will flag action to wait on reruns if present" - required: false - default: "CI Connection Halt - On Retry" - should-wait-always-tag: - description: "Tag that will flag action to wait on reruns if present" - required: false - default: "CI Connection Halt - Always" - repository: - description: 'Repository name with owner. For example, actions/checkout' - default: ${{ github.repository }} + default: "no" runs: using: "composite" steps: - - name: Get current labels - shell: bash - id: get-labels - run: | - # Pick an existing Python alias - python_bin=$(which python3 2>/dev/null || which python) - # Get the labels - labels=$($python_bin "$GITHUB_ACTION_PATH"/get_labels.py) - # Store them in the output, to be used in subsequent steps - echo "labels=$labels" >> "$GITHUB_OUTPUT" - - - name: Print out halt conditions - shell: bash - run: | - echo "All labels: ${{ steps.get-labels.outputs.labels }}" - echo "Halt retry tag: ${{ inputs.should-wait-retry-tag }}" - echo "Halt always tag: ${{ inputs.should-wait-always-tag }}" - echo "Should halt input: ${{ inputs.halt-dispatch-input }}" - echo "Reattempt count: ${{ github.run_attempt }}" - echo "PR number: ${{ github.event.number }}" - - - name: Halt for connection + - name: Wait for connection (conditionally) shell: bash - # fromJSON parses the search string into an actual array - # https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/evaluate-expressions-in-workflows-and-actions#example-matching-an-array-of-strings - - # Wait on 2+ retries, if the wait-on-retry label is set - # Always wait, if the always-wait label is set - # Always wait if the workflow was triggered manually, and the value was set - if: | - contains(fromJSON(steps.get-labels.outputs.labels), inputs.should-wait-retry-tag) && github.run_attempt > 1 || - contains(fromJSON(steps.get-labels.outputs.labels), inputs.should-wait-always-tag) || - contains(fromJSON('["yes", "Yes", "y", "1"]'), inputs.halt-dispatch-input) env: - REPOSITORY: ${{ inputs.repository }} - INTERACTIVE_CI: 1 + # This variable is on by default, but can be overridden for convenience, + # in case the workflow calling this action should not be halted, despite + # other labels/inputs. + INTERACTIVE_CI: 1 PYTHONUNBUFFERED: 1 + HALT_DISPATCH_INPUT: ${{ inputs.halt-dispatch-input }} run: | - echo "$GITHUB_ACTION_PATH" - python3 $GITHUB_ACTION_PATH/wait_for_connection.py + # Pick an existing Python alias + python_bin=$(which python3 2>/dev/null || which python) + # Wait for the connection, if a halt was requested via a label/input + "$python_bin" "$GITHUB_ACTION_PATH/wait_for_connection.py" diff --git a/actions/ci_connection/get_labels.py b/actions/ci_connection/get_labels.py index 54f2833d53ab..ac6f7f3a6151 100644 --- a/actions/ci_connection/get_labels.py +++ b/actions/ci_connection/get_labels.py @@ -14,89 +14,84 @@ import logging import os import re -import sys import time import urllib.request -# Check if debug logging should be enabled for the script: -# GET_LABELS_DEBUG is a variable specifically for this script. -# RUNNER_DEBUG and ACTIONS_RUNNER_DEBUG are GH env vars, which can be set -# in various ways, one of them - enabling debug logging from the UI, when -# triggering a run -# https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/store-information-in-variables#default-environment-variables -# https://docs.github.com/en/actions/monitoring-and-troubleshooting-workflows/troubleshooting-workflows/enabling-debug-logging#enabling-runner-diagnostic-logging -_SHOW_DEBUG = bool( - os.getenv('GET_LABELS_DEBUG', - os.getenv('RUNNER_DEBUG', - os.getenv('ACTIONS_RUNNER_DEBUG'))) -) -logging.basicConfig(level=logging.INFO if not _SHOW_DEBUG else logging.DEBUG, - format='%(levelname)s: %(message)s', stream=sys.stderr) - - -# Check if this is a PR (pull request) -_GITHUB_REF = os.environ.get('GITHUB_REF') -# Outside a PR context - no labels to be found -if not _GITHUB_REF.startswith('refs/pull/'): - logging.debug('Not a PR run') - print([]) - raise SystemExit - -# Get the PR number -# Since passing the previous check confirms this is a PR, there's no need -# to safeguard this regex -GH_ISSUE = re.search(r'refs/pull/(\d+)/merge', _GITHUB_REF).group(1) -GH_REPO = os.environ.get('GITHUB_REPOSITORY') -URL = f'https://api.github.com/repos/{GH_REPO}/issues/{GH_ISSUE}/labels' -WAIT_TIME = 3 -ATTEMPTS = 3 - -data = None -cur_attempt = 1 - -logging.debug(f'{GH_ISSUE=!r}\n' - f'{GH_REPO=!r}') - -# Try retrieving the labels' info via API -while cur_attempt <= ATTEMPTS: - request = urllib.request.Request( - URL, - headers={'Accept': 'application/vnd.github+json', - 'X-GitHub-Api-Version': '2022-11-28'} +def retrieve_labels(print_to_stdout: bool = True) -> list[str]: + """Get the most up-to-date labels. + + In case this is not a PR, return an empty list. + """ + # Check if this is a PR (pull request) + github_ref = os.getenv('GITHUB_REF') + # Outside a PR context - no labels to be found + if not github_ref.startswith('refs/pull/'): + logging.debug('Not a PR workflow run, returning an empty label list') + if print_to_stdout: + print([]) + return [] + + # Get the PR number + # Since passing the previous check confirms this is a PR, there's no need + # to safeguard this regex + gh_issue = re.search(r'refs/pull/(\d+)/merge', github_ref).group(1) + gh_repo = os.getenv('GITHUB_REPOSITORY') + labels_url = ( + f'https://api.github.com/repos/{gh_repo}/issues/{gh_issue}/labels' ) - logging.info(f'Retrieving PR labels via API - attempt {cur_attempt}...') - response = urllib.request.urlopen(request) - - if response.status == 200: - data = response.read().decode('utf-8') - logging.debug('API labels data: \n' - f'{data}') - break + logging.debug(f'{gh_issue=!r}\n' + f'{gh_repo=!r}') + + wait_time = 3 + total_attempts = 3 + cur_attempt = 1 + data = None + + # Try retrieving the labels' info via API + while cur_attempt <= total_attempts: + request = urllib.request.Request( + labels_url, + headers={'Accept': 'application/vnd.github+json', + 'X-GitHub-Api-Version': '2022-11-28'} + ) + logging.info(f'Retrieving PR labels via API - attempt {cur_attempt}...') + response = urllib.request.urlopen(request) + + if response.status == 200: + data = response.read().decode('utf-8') + logging.debug('API labels data: \n' + f'{data}') + break + else: + logging.error(f'Request failed with status code: {response.status}') + cur_attempt += 1 + if cur_attempt <= total_attempts: + logging.info(f'Trying again in {wait_time} seconds') + time.sleep(wait_time) + + # The null check is probably unnecessary, but rather be safe + if data and data != 'null': + data_json = json.loads(data) else: - logging.error(f'Request failed with status code: {response.status}') - cur_attempt += 1 - if cur_attempt <= ATTEMPTS: - logging.info(f'Trying again in {WAIT_TIME} seconds') - time.sleep(WAIT_TIME) - - -# The null check is probably unnecessary, but rather be safe -if data and data != 'null': - data_json = json.loads(data) -else: - # Fall back on labels from the event's payload, if API failed (unlikely) - event_payload_path = os.environ.get('GITHUB_EVENT_PATH') - with open(event_payload_path, 'r', encoding='utf-8') as event_payload: - data_json = json.load(event_payload).get('pull_request', - {}).get('labels', []) - logging.info('Using fallback labels') - logging.info(f'Fallback labels: \n' - f'{data_json}') - - -labels = [label['name'] for label in data_json] -logging.debug(f'Final labels: \n' - f'{labels}') -# Output the labels to stdout for further use elsewhere -print(labels) + # Fall back on labels from the event's payload, if API failed (unlikely) + event_payload_path = os.getenv('GITHUB_EVENT_PATH') + with open(event_payload_path, 'r', encoding='utf-8') as event_payload: + data_json = json.load(event_payload).get('pull_request', + {}).get('labels', []) + logging.info('Using fallback labels') + logging.info(f'Fallback labels: \n' + f'{data_json}') + + labels = [label['name'] for label in data_json] + logging.debug(f'Final labels: \n' + f'{labels}') + + # Output the labels to stdout for further use elsewhere + if print_to_stdout: + print(labels) + return labels + + +if __name__ == '__main__': + retrieve_labels(print_to_stdout=True) diff --git a/actions/ci_connection/wait_for_connection.py b/actions/ci_connection/wait_for_connection.py index e752d0afb747..a510308835aa 100644 --- a/actions/ci_connection/wait_for_connection.py +++ b/actions/ci_connection/wait_for_connection.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import logging import os from multiprocessing.connection import Listener import time @@ -19,99 +20,154 @@ import threading import sys +from get_labels import retrieve_labels + +# Check if debug logging should be enabled for the script: +# WAIT_FOR_CONNECTION_DEBUG is a custom variable. +# RUNNER_DEBUG and ACTIONS_RUNNER_DEBUG are GH env vars, which can be set +# in various ways, one of them - enabling debug logging from the UI, when +# triggering a run: +# https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/store-information-in-variables#default-environment-variables +# https://docs.github.com/en/actions/monitoring-and-troubleshooting-workflows/troubleshooting-workflows/enabling-debug-logging#enabling-runner-diagnostic-logging +_SHOW_DEBUG = bool( + os.getenv('WAIT_FOR_CONNECTION_DEBUG', + os.getenv('RUNNER_DEBUG', + os.getenv('ACTIONS_RUNNER_DEBUG'))) +) +logging.basicConfig(level=logging.INFO if not _SHOW_DEBUG else logging.DEBUG, + format='%(levelname)s: %(message)s', stream=sys.stderr) + + last_time = time.time() timeout = 600 # 10 minutes for initial connection keep_alive_timeout = ( - 900 # 30 minutes for keep-alive if no closed message (allow for reconnects) + 900 # 30 minutes for keep-alive, if no closed message (allow for reconnects) ) +def should_halt_for_connection() -> bool: + """Check if the workflow should wait, due to inputs, vars, and labels.""" + + logging.info('Checking if the workflow should be halted for a connection...') + + interactive_ci_var = os.getenv("INTERACTIVE_CI", "").lower() + if (not interactive_ci_var or + interactive_ci_var in {'0', 'false', 'no', 'none', 'null', 'n/a'}): + logging.info("INTERACTIVE_CI env var is not" + "set, or is set to a falsy value in the workflow") + return False + + explicit_halt_requested = os.getenv("HALT_DISPATCH_INPUT") + if explicit_halt_requested: + logging.info("Halt for connection requested via " + "explicit `halt-dispatch-input` input") + return True + + # Check if any of the relevant labels are present + labels = retrieve_labels(print_to_stdout=False) + + # TODO(belitskiy): Add the ability to halt on CI error. + + # Note: there's always a small possibility these labels may change on the + # repo/org level, in which case, they'd need to be updated here as well. + always_halt_label = "CI Connection Halt - Always" + if always_halt_label in labels: + logging.info(f"Halt for connection requested via presence " + f"of the {always_halt_label!r} label") + return True + + attempt = int(os.getenv("GITHUB_RUN_ATTEMPT")) + halt_on_retry_label = "CI Connection Halt - On Retry" + if attempt > 1 and halt_on_retry_label in labels: + logging.info(f"Halt for connection requested via presence " + f"of the {halt_on_retry_label!r} label, " + f"due to workflow run attempt being 2+ ({attempt})") + return True + + return False + + def wait_for_notification(address): """Waits for connection notification from the listener.""" - global last_time + # TODO(belitskiy): Get rid of globals? + global last_time, timeout while True: with Listener(address) as listener: - print("Waiting for connection") + logging.info("Waiting for connection...") with listener.accept() as conn: while True: try: message = conn.recv() except EOFError as e: - print("EOFError occurred:", e) + logging.error("EOFError occurred:", e) break - print("Received message") + logging.info("Received message") if message == "keep_alive": - print("Keep alive received") + logging.info("Keep-alive received") last_time = time.time() continue # Keep-alive received, continue waiting elif message == "closed": - print("Connection closed by the other process.") + logging.info("Connection closed by the other process.") return # Graceful exit elif message == "connected": last_time = time.time() timeout = keep_alive_timeout - print("Connected") + logging.info("Connected") else: - print("Unknown message received:", message) + logging.warning("Unknown message received:", message) continue def timer(): while True: - print("Checking status") + logging.info("Checking status") time_elapsed = time.time() - last_time if time_elapsed < timeout: - print(f"Time since last keepalive {int(time_elapsed)}s") + logging.info(f"Time since last keep-alive: {int(time_elapsed)}s") else: - print("Timeout reached, exiting") + logging.info("Timeout reached, exiting") os.kill(os.getpid(), signal.SIGTERM) time.sleep(60) -if __name__ == "__main__": +def wait_for_connection(): address = ("localhost", 12455) # Address and port to listen on - # Check if we should wait for the connection - wait_for_connection = False - # if os.environ.get("WAIT_ON_ERROR") == "1": - # print("WAIT_ON_ERROR is set") - # if os.getppid() != 1: - # print("Previous command did not exit with success, waiting for connection") - # wait_for_connection = True - # else: - # print("Previous command exited with success") - # else: - # print("WAIT_ON_ERROR is not set") - - if os.environ.get("INTERACTIVE_CI") == "1": - print("INTERACTIVE_CI is set, waiting for connection") - wait_for_connection = True - else: - print("INTERACTIVE_CI is not set") - - if not wait_for_connection: - print("No condition was met to wait for connection. Continuing Job") - exit(0) - - # Grab and print the data required to connect to this vm - host = os.environ.get("HOSTNAME") - repo = os.environ.get("REPOSITORY") - cluster = os.environ.get("CONNECTION_CLUSTER") - location = os.environ.get("CONNECTION_LOCATION") - ns = os.environ.get("CONNECTION_NS") - actions_path = os.environ.get("GITHUB_ACTION_PATH") - - print("Googler connection only\nSee go/ for details") - print( - f"Connection string: ml-actions-connect --runner={host} --ns={ns} --loc={location} --cluster={cluster} --halt_directory={actions_path}" + + # Print out the data required to connect to this VM + host = os.getenv("HOSTNAME") + cluster = os.getenv("CONNECTION_CLUSTER") + location = os.getenv("CONNECTION_LOCATION") + ns = os.getenv("CONNECTION_NS") + actions_path = os.getenv("GITHUB_ACTION_PATH") + + logging.info("Googler connection only\n" + "See go/ml-github-actions:ssh for details") + logging.info( + f"Connection string: ml-actions-connect " + f"--runner={host} " + f"--ns={ns} " + f"--loc={location} " + f"--cluster={cluster} " + f"--halt_directory={actions_path}" ) - # Thread is running as a daemon so it will quit when the + # Thread is running as a daemon, so it will quit when the # main thread terminates. timer_thread = threading.Thread(target=timer, daemon=True) timer_thread.start() - wait_for_notification(address) # Wait for connection and get the connection object + # Wait for connection and get the connection object + wait_for_notification(address) - print("Exiting connection wait loop.") + logging.info("Exiting connection wait loop.") # Force a flush so we don't miss messages sys.stdout.flush() + + +if __name__ == "__main__": + if not should_halt_for_connection(): + logging.info('No conditions for halting the workflow' + 'for connection were met') + exit() + + wait_for_connection() From 21770de7cc22aa7368a4fc4108f5540ab5fca7d0 Mon Sep 17 00:00:00 2001 From: Vladimir Belitskiy Date: Wed, 18 Sep 2024 18:40:35 +0000 Subject: [PATCH 09/12] Slightly strengthen the halt input check. --- actions/ci_connection/wait_for_connection.py | 21 +++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/actions/ci_connection/wait_for_connection.py b/actions/ci_connection/wait_for_connection.py index a510308835aa..d08ccd6659ca 100644 --- a/actions/ci_connection/wait_for_connection.py +++ b/actions/ci_connection/wait_for_connection.py @@ -45,19 +45,25 @@ ) +def _is_truthy_env_var(var_name: str) -> bool: + var_val = os.getenv(var_name, "").lower() + negative_choices = {'0', 'false', 'n', 'no', 'none', 'null', 'n/a'} + if var_val and var_val not in negative_choices: + return True + return False + + def should_halt_for_connection() -> bool: """Check if the workflow should wait, due to inputs, vars, and labels.""" logging.info('Checking if the workflow should be halted for a connection...') - interactive_ci_var = os.getenv("INTERACTIVE_CI", "").lower() - if (not interactive_ci_var or - interactive_ci_var in {'0', 'false', 'no', 'none', 'null', 'n/a'}): - logging.info("INTERACTIVE_CI env var is not" + if not _is_truthy_env_var("INTERACTIVE_CI"): + logging.info("INTERACTIVE_CI env var is not " "set, or is set to a falsy value in the workflow") return False - explicit_halt_requested = os.getenv("HALT_DISPATCH_INPUT") + explicit_halt_requested = _is_truthy_env_var("HALT_DISPATCH_INPUT") if explicit_halt_requested: logging.info("Halt for connection requested via " "explicit `halt-dispatch-input` input") @@ -66,10 +72,11 @@ def should_halt_for_connection() -> bool: # Check if any of the relevant labels are present labels = retrieve_labels(print_to_stdout=False) + # Note: there's always a small possibility these labels may change on the + # repo/org level, in which case, they'd need to be updated below as well. + # TODO(belitskiy): Add the ability to halt on CI error. - # Note: there's always a small possibility these labels may change on the - # repo/org level, in which case, they'd need to be updated here as well. always_halt_label = "CI Connection Halt - Always" if always_halt_label in labels: logging.info(f"Halt for connection requested via presence " From 6500c18b1d3ed60e9b528959b6914daa249792c8 Mon Sep 17 00:00:00 2001 From: Vladimir Belitskiy Date: Thu, 19 Sep 2024 17:45:34 +0000 Subject: [PATCH 10/12] Don't fail the job on wait-for-connection step failure. --- actions/ci_connection/action.yaml | 2 ++ actions/ci_connection/get_labels.py | 20 +++++++++++++++++++- actions/ci_connection/wait_for_connection.py | 16 ++++++++-------- 3 files changed, 29 insertions(+), 9 deletions(-) diff --git a/actions/ci_connection/action.yaml b/actions/ci_connection/action.yaml index 3795c11fab5b..9b2696a29b4d 100644 --- a/actions/ci_connection/action.yaml +++ b/actions/ci_connection/action.yaml @@ -17,6 +17,8 @@ runs: INTERACTIVE_CI: 1 PYTHONUNBUFFERED: 1 HALT_DISPATCH_INPUT: ${{ inputs.halt-dispatch-input }} + # The calling workflow shouldn't fail in case this step does + continue-on-error: true run: | # Pick an existing Python alias python_bin=$(which python3 2>/dev/null || which python) diff --git a/actions/ci_connection/get_labels.py b/actions/ci_connection/get_labels.py index ac6f7f3a6151..47bff1376535 100644 --- a/actions/ci_connection/get_labels.py +++ b/actions/ci_connection/get_labels.py @@ -1,3 +1,17 @@ +# Copyright 2024 Google LLC + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at + +# https://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + """Retrieve PR labels, if any. While these labels are also available via GH context, and the event payload @@ -24,7 +38,11 @@ def retrieve_labels(print_to_stdout: bool = True) -> list[str]: In case this is not a PR, return an empty list. """ # Check if this is a PR (pull request) - github_ref = os.getenv('GITHUB_REF') + github_ref = os.getenv('GITHUB_REF', '') + if not github_ref: + raise TypeError('GITHUB_REF is not defined. ' + 'Is this being run outside of GitHub Actions?') + # Outside a PR context - no labels to be found if not github_ref.startswith('refs/pull/'): logging.debug('Not a PR workflow run, returning an empty label list') diff --git a/actions/ci_connection/wait_for_connection.py b/actions/ci_connection/wait_for_connection.py index d08ccd6659ca..38feb758029b 100644 --- a/actions/ci_connection/wait_for_connection.py +++ b/actions/ci_connection/wait_for_connection.py @@ -30,12 +30,12 @@ # https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/store-information-in-variables#default-environment-variables # https://docs.github.com/en/actions/monitoring-and-troubleshooting-workflows/troubleshooting-workflows/enabling-debug-logging#enabling-runner-diagnostic-logging _SHOW_DEBUG = bool( - os.getenv('WAIT_FOR_CONNECTION_DEBUG', - os.getenv('RUNNER_DEBUG', - os.getenv('ACTIONS_RUNNER_DEBUG'))) + os.getenv("WAIT_FOR_CONNECTION_DEBUG", + os.getenv("RUNNER_DEBUG", + os.getenv("ACTIONS_RUNNER_DEBUG"))) ) logging.basicConfig(level=logging.INFO if not _SHOW_DEBUG else logging.DEBUG, - format='%(levelname)s: %(message)s', stream=sys.stderr) + format="%(levelname)s: %(message)s", stream=sys.stderr) last_time = time.time() @@ -47,7 +47,7 @@ def _is_truthy_env_var(var_name: str) -> bool: var_val = os.getenv(var_name, "").lower() - negative_choices = {'0', 'false', 'n', 'no', 'none', 'null', 'n/a'} + negative_choices = {"0", "false", "n", "no", "none", "null", "n/a"} if var_val and var_val not in negative_choices: return True return False @@ -56,7 +56,7 @@ def _is_truthy_env_var(var_name: str) -> bool: def should_halt_for_connection() -> bool: """Check if the workflow should wait, due to inputs, vars, and labels.""" - logging.info('Checking if the workflow should be halted for a connection...') + logging.info("Checking if the workflow should be halted for a connection...") if not _is_truthy_env_var("INTERACTIVE_CI"): logging.info("INTERACTIVE_CI env var is not " @@ -173,8 +173,8 @@ def wait_for_connection(): if __name__ == "__main__": if not should_halt_for_connection(): - logging.info('No conditions for halting the workflow' - 'for connection were met') + logging.info("No conditions for halting the workflow" + "for connection were met") exit() wait_for_connection() From d43e84c3b9adca23bdae9b075483d509d7863cbb Mon Sep 17 00:00:00 2001 From: Vladimir Belitskiy Date: Fri, 27 Sep 2024 16:03:18 +0000 Subject: [PATCH 11/12] Source from actions repo. --- .github/workflows/wait-for-connection-test.yaml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/wait-for-connection-test.yaml b/.github/workflows/wait-for-connection-test.yaml index aad898c9bd83..ba50fa5c2504 100644 --- a/.github/workflows/wait-for-connection-test.yaml +++ b/.github/workflows/wait-for-connection-test.yaml @@ -34,8 +34,14 @@ jobs: steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # ratchet:actions/checkout@v4 # Halt for connection if workflow dispatch is told to or if it is a retry with the label halt_on_retry + - name: Get external actions + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # ratchet:actions/checkout@v4 + with: + repository: google-ml-infra/actions + ref: polish_ssh_connection_action + path: cool_actions - name: Wait For Connection - uses: ./actions/ci_connection/ + uses: ./cool_actions/ci_connection/ with: halt-dispatch-input: ${{ inputs.halt-for-connection }} - name: Echo From 5f3286633bf9325255804f1277fe1c3d29914558 Mon Sep 17 00:00:00 2001 From: Vladimir Belitskiy Date: Mon, 30 Sep 2024 13:15:32 +0000 Subject: [PATCH 12/12] Test out graceful timeout. --- .github/workflows/wait-for-connection-test.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/wait-for-connection-test.yaml b/.github/workflows/wait-for-connection-test.yaml index ba50fa5c2504..ddb5f45031c0 100644 --- a/.github/workflows/wait-for-connection-test.yaml +++ b/.github/workflows/wait-for-connection-test.yaml @@ -38,7 +38,7 @@ jobs: uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # ratchet:actions/checkout@v4 with: repository: google-ml-infra/actions - ref: polish_ssh_connection_action + ref: ssh_graceful_timeout path: cool_actions - name: Wait For Connection uses: ./cool_actions/ci_connection/