Skip to content

Commit 8c32cd2

Browse files
authored
Merge branch 'master' into improve-metrics
2 parents fb40b23 + 3c14ee0 commit 8c32cd2

File tree

9 files changed

+156
-35
lines changed

9 files changed

+156
-35
lines changed

.github/actions/execute-assert-arc-e2e/action.yaml

+15-2
Original file line numberDiff line numberDiff line change
@@ -188,15 +188,28 @@ runs:
188188
}
189189
core.setFailed(`The triggered workflow run didn't finish properly using ${{inputs.arc-name}}`)
190190
191+
- name: Gather listener logs
192+
shell: bash
193+
if: always()
194+
run: |
195+
LISTENER_POD="$(kubectl get autoscalinglisteners.actions.github.com -n arc-systems -o jsonpath='{.items[*].metadata.name}')"
196+
kubectl logs $LISTENER_POD -n ${{inputs.arc-controller-namespace}}
197+
198+
- name: Gather coredns logs
199+
shell: bash
200+
if: always()
201+
run: |
202+
kubectl logs deployments/coredns -n kube-system
203+
191204
- name: cleanup
192205
if: inputs.wait-to-finish == 'true'
193206
shell: bash
194207
run: |
195208
helm uninstall ${{ inputs.arc-name }} --namespace ${{inputs.arc-namespace}} --debug
196209
kubectl wait --timeout=30s --for=delete AutoScalingRunnerSet -n ${{inputs.arc-namespace}} -l app.kubernetes.io/instance=${{ inputs.arc-name }}
197210
198-
- name: Gather logs and cleanup
211+
- name: Gather controller logs
199212
shell: bash
200213
if: always()
201214
run: |
202-
kubectl logs deployment/arc-gha-rs-controller -n ${{inputs.arc-controller-namespace}}
215+
kubectl logs deployment/arc-gha-rs-controller -n ${{inputs.arc-controller-namespace}}

.github/workflows/gha-e2e-tests.yaml

+16
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,8 @@ jobs:
103103
kubectl wait --timeout=30s --for=condition=ready pod -n arc-systems -l actions.github.com/scale-set-name=$ARC_NAME
104104
kubectl get pod -n arc-systems
105105
106+
sleep 60
107+
106108
- name: Test ARC E2E
107109
uses: ./.github/actions/execute-assert-arc-e2e
108110
timeout-minutes: 10
@@ -194,6 +196,8 @@ jobs:
194196
kubectl wait --timeout=30s --for=condition=ready pod -n arc-systems -l actions.github.com/scale-set-name=$ARC_NAME
195197
kubectl get pod -n arc-systems
196198
199+
sleep 60
200+
197201
- name: Test ARC E2E
198202
uses: ./.github/actions/execute-assert-arc-e2e
199203
timeout-minutes: 10
@@ -284,6 +288,8 @@ jobs:
284288
kubectl wait --timeout=30s --for=condition=ready pod -n arc-systems -l actions.github.com/scale-set-name=$ARC_NAME
285289
kubectl get pod -n arc-systems
286290
291+
sleep 60
292+
287293
- name: Test ARC E2E
288294
uses: ./.github/actions/execute-assert-arc-e2e
289295
timeout-minutes: 10
@@ -383,6 +389,8 @@ jobs:
383389
kubectl wait --timeout=30s --for=condition=ready pod -n arc-systems -l actions.github.com/scale-set-name=$ARC_NAME
384390
kubectl get pod -n arc-systems
385391
392+
sleep 60
393+
386394
- name: Test ARC E2E
387395
uses: ./.github/actions/execute-assert-arc-e2e
388396
timeout-minutes: 10
@@ -484,6 +492,8 @@ jobs:
484492
kubectl wait --timeout=30s --for=condition=ready pod -n arc-systems -l actions.github.com/scale-set-name=$ARC_NAME
485493
kubectl get pod -n arc-systems
486494
495+
sleep 60
496+
487497
- name: Test ARC E2E
488498
uses: ./.github/actions/execute-assert-arc-e2e
489499
timeout-minutes: 10
@@ -579,6 +589,8 @@ jobs:
579589
kubectl wait --timeout=30s --for=condition=ready pod -n arc-systems -l actions.github.com/scale-set-name=$ARC_NAME
580590
kubectl get pod -n arc-systems
581591
592+
sleep 60
593+
582594
- name: Test ARC E2E
583595
uses: ./.github/actions/execute-assert-arc-e2e
584596
timeout-minutes: 10
@@ -699,6 +711,8 @@ jobs:
699711
kubectl wait --timeout=30s --for=condition=ready pod -n arc-systems -l actions.github.com/scale-set-name=$ARC_NAME
700712
kubectl get pod -n arc-systems
701713
714+
sleep 60
715+
702716
- name: Test ARC E2E
703717
uses: ./.github/actions/execute-assert-arc-e2e
704718
timeout-minutes: 10
@@ -789,6 +803,8 @@ jobs:
789803
kubectl wait --timeout=30s --for=condition=ready pod -n arc-systems -l actions.github.com/scale-set-name=$ARC_NAME
790804
kubectl get pod -n arc-systems
791805
806+
sleep 60
807+
792808
- name: Trigger long running jobs and wait for runners to pick them up
793809
uses: ./.github/actions/execute-assert-arc-e2e
794810
timeout-minutes: 10

charts/gha-runner-scale-set-controller/templates/deployment.yaml

+3
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,9 @@ spec:
6565
{{- with .Values.flags.watchSingleNamespace }}
6666
- "--watch-single-namespace={{ . }}"
6767
{{- end }}
68+
{{- with .Values.runnerMaxConcurrentReconciles }}
69+
- "--runner-max-concurrent-reconciles={{ . }}"
70+
{{- end }}
6871
{{- with .Values.flags.updateStrategy }}
6972
- "--update-strategy={{ . }}"
7073
{{- end }}

charts/gha-runner-scale-set-controller/values.yaml

+5
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,11 @@ flags:
106106
## Defaults to watch all namespaces when unset.
107107
# watchSingleNamespace: ""
108108

109+
## The maximum number of concurrent reconciles which can be run by the EphemeralRunner controller.
110+
# Increase this value to improve the throughput of the controller.
111+
# It may also increase the load on the API server and the external service (e.g. GitHub API).
112+
runnerMaxConcurrentReconciles: 2
113+
109114
## Defines how the controller should handle upgrades while having running jobs.
110115
##
111116
## The strategies available are:

controllers/actions.github.com/ephemeralrunner_controller.go

+46-23
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,9 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
178178

179179
if ephemeralRunner.Status.RunnerId == 0 {
180180
log.Info("Creating new ephemeral runner registration and updating status with runner config")
181-
return r.updateStatusWithRunnerConfig(ctx, ephemeralRunner, log)
181+
if r, err := r.updateStatusWithRunnerConfig(ctx, ephemeralRunner, log); r != nil {
182+
return *r, err
183+
}
182184
}
183185

184186
secret := new(corev1.Secret)
@@ -189,7 +191,17 @@ func (r *EphemeralRunnerReconciler) Reconcile(ctx context.Context, req ctrl.Requ
189191
}
190192
// create secret if not created
191193
log.Info("Creating new ephemeral runner secret for jitconfig.")
192-
return r.createSecret(ctx, ephemeralRunner, log)
194+
if r, err := r.createSecret(ctx, ephemeralRunner, log); r != nil {
195+
return *r, err
196+
}
197+
198+
// Retry to get the secret that was just created.
199+
// Otherwise, even though we want to continue to create the pod,
200+
// it fails due to the missing secret resulting in an invalid pod spec.
201+
if err := r.Get(ctx, req.NamespacedName, secret); err != nil {
202+
log.Error(err, "Failed to fetch secret")
203+
return ctrl.Result{}, err
204+
}
193205
}
194206

195207
pod := new(corev1.Pod)
@@ -511,12 +523,12 @@ func (r *EphemeralRunnerReconciler) deletePodAsFailed(ctx context.Context, ephem
511523

512524
// updateStatusWithRunnerConfig fetches runtime configuration needed by the runner
513525
// This method should always set .status.runnerId and .status.runnerJITConfig
514-
func (r *EphemeralRunnerReconciler) updateStatusWithRunnerConfig(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, log logr.Logger) (ctrl.Result, error) {
526+
func (r *EphemeralRunnerReconciler) updateStatusWithRunnerConfig(ctx context.Context, ephemeralRunner *v1alpha1.EphemeralRunner, log logr.Logger) (*ctrl.Result, error) {
515527
// Runner is not registered with the service. We need to register it first
516528
log.Info("Creating ephemeral runner JIT config")
517529
actionsClient, err := r.actionsClientFor(ctx, ephemeralRunner)
518530
if err != nil {
519-
return ctrl.Result{}, fmt.Errorf("failed to get actions client for generating JIT config: %v", err)
531+
return &ctrl.Result{}, fmt.Errorf("failed to get actions client for generating JIT config: %v", err)
520532
}
521533

522534
jitSettings := &actions.RunnerScaleSetJitRunnerSetting{
@@ -534,12 +546,12 @@ func (r *EphemeralRunnerReconciler) updateStatusWithRunnerConfig(ctx context.Con
534546
if err != nil {
535547
actionsError := &actions.ActionsError{}
536548
if !errors.As(err, &actionsError) {
537-
return ctrl.Result{}, fmt.Errorf("failed to generate JIT config with generic error: %v", err)
549+
return &ctrl.Result{}, fmt.Errorf("failed to generate JIT config with generic error: %v", err)
538550
}
539551

540552
if actionsError.StatusCode != http.StatusConflict ||
541553
!actionsError.IsException("AgentExistsException") {
542-
return ctrl.Result{}, fmt.Errorf("failed to generate JIT config with Actions service error: %v", err)
554+
return &ctrl.Result{}, fmt.Errorf("failed to generate JIT config with Actions service error: %v", err)
543555
}
544556

545557
// If the runner with the name we want already exists it means:
@@ -552,29 +564,29 @@ func (r *EphemeralRunnerReconciler) updateStatusWithRunnerConfig(ctx context.Con
552564
log.Info("Getting runner jit config failed with conflict error, trying to get the runner by name", "runnerName", ephemeralRunner.Name)
553565
existingRunner, err := actionsClient.GetRunnerByName(ctx, ephemeralRunner.Name)
554566
if err != nil {
555-
return ctrl.Result{}, fmt.Errorf("failed to get runner by name: %v", err)
567+
return &ctrl.Result{}, fmt.Errorf("failed to get runner by name: %v", err)
556568
}
557569

558570
if existingRunner == nil {
559571
log.Info("Runner with the same name does not exist, re-queuing the reconciliation")
560-
return ctrl.Result{Requeue: true}, nil
572+
return &ctrl.Result{Requeue: true}, nil
561573
}
562574

563575
log.Info("Found the runner with the same name", "runnerId", existingRunner.Id, "runnerScaleSetId", existingRunner.RunnerScaleSetId)
564576
if existingRunner.RunnerScaleSetId == ephemeralRunner.Spec.RunnerScaleSetId {
565577
log.Info("Removing the runner with the same name")
566578
err := actionsClient.RemoveRunner(ctx, int64(existingRunner.Id))
567579
if err != nil {
568-
return ctrl.Result{}, fmt.Errorf("failed to remove runner from the service: %v", err)
580+
return &ctrl.Result{}, fmt.Errorf("failed to remove runner from the service: %v", err)
569581
}
570582

571583
log.Info("Removed the runner with the same name, re-queuing the reconciliation")
572-
return ctrl.Result{Requeue: true}, nil
584+
return &ctrl.Result{Requeue: true}, nil
573585
}
574586

575587
// TODO: Do we want to mark the ephemeral runner as failed, and let EphemeralRunnerSet to clean it up, so we can recover from this situation?
576588
// The situation is that the EphemeralRunner's name is already used by something else to register a runner, and we can't take the control back.
577-
return ctrl.Result{}, fmt.Errorf("runner with the same name but doesn't belong to this RunnerScaleSet: %v", err)
589+
return &ctrl.Result{}, fmt.Errorf("runner with the same name but doesn't belong to this RunnerScaleSet: %v", err)
578590
}
579591
log.Info("Created ephemeral runner JIT config", "runnerId", jitConfig.Runner.Id)
580592

@@ -585,11 +597,20 @@ func (r *EphemeralRunnerReconciler) updateStatusWithRunnerConfig(ctx context.Con
585597
obj.Status.RunnerJITConfig = jitConfig.EncodedJITConfig
586598
})
587599
if err != nil {
588-
return ctrl.Result{}, fmt.Errorf("failed to update runner status for RunnerId/RunnerName/RunnerJITConfig: %v", err)
600+
return &ctrl.Result{}, fmt.Errorf("failed to update runner status for RunnerId/RunnerName/RunnerJITConfig: %v", err)
589601
}
590602

603+
// We want to continue without a requeue for faster pod creation.
604+
//
605+
// To do so, we update the status in-place, so that both continuing the loop and
606+
// and requeuing and skipping updateStatusWithRunnerConfig in the next loop, will
607+
// have the same effect.
608+
ephemeralRunner.Status.RunnerId = jitConfig.Runner.Id
609+
ephemeralRunner.Status.RunnerName = jitConfig.Runner.Name
610+
ephemeralRunner.Status.RunnerJITConfig = jitConfig.EncodedJITConfig
611+
591612
log.Info("Updated ephemeral runner status with runnerId and runnerJITConfig")
592-
return ctrl.Result{}, nil
613+
return nil, nil
593614
}
594615

595616
func (r *EphemeralRunnerReconciler) createPod(ctx context.Context, runner *v1alpha1.EphemeralRunner, secret *corev1.Secret, log logr.Logger) (ctrl.Result, error) {
@@ -665,21 +686,21 @@ func (r *EphemeralRunnerReconciler) createPod(ctx context.Context, runner *v1alp
665686
return ctrl.Result{}, nil
666687
}
667688

668-
func (r *EphemeralRunnerReconciler) createSecret(ctx context.Context, runner *v1alpha1.EphemeralRunner, log logr.Logger) (ctrl.Result, error) {
689+
func (r *EphemeralRunnerReconciler) createSecret(ctx context.Context, runner *v1alpha1.EphemeralRunner, log logr.Logger) (*ctrl.Result, error) {
669690
log.Info("Creating new secret for ephemeral runner")
670691
jitSecret := r.ResourceBuilder.newEphemeralRunnerJitSecret(runner)
671692

672693
if err := ctrl.SetControllerReference(runner, jitSecret, r.Scheme); err != nil {
673-
return ctrl.Result{}, fmt.Errorf("failed to set controller reference: %v", err)
694+
return &ctrl.Result{}, fmt.Errorf("failed to set controller reference: %v", err)
674695
}
675696

676697
log.Info("Created new secret spec for ephemeral runner")
677698
if err := r.Create(ctx, jitSecret); err != nil {
678-
return ctrl.Result{}, fmt.Errorf("failed to create jit secret: %v", err)
699+
return &ctrl.Result{}, fmt.Errorf("failed to create jit secret: %v", err)
679700
}
680701

681702
log.Info("Created ephemeral runner secret", "secretName", jitSecret.Name)
682-
return ctrl.Result{Requeue: true}, nil
703+
return nil, nil
683704
}
684705

685706
// updateRunStatusFromPod is responsible for updating non-exiting statuses.
@@ -823,12 +844,14 @@ func (r *EphemeralRunnerReconciler) deleteRunnerFromService(ctx context.Context,
823844
}
824845

825846
// SetupWithManager sets up the controller with the Manager.
826-
func (r *EphemeralRunnerReconciler) SetupWithManager(mgr ctrl.Manager) error {
827-
return ctrl.NewControllerManagedBy(mgr).
828-
For(&v1alpha1.EphemeralRunner{}).
829-
Owns(&corev1.Pod{}).
830-
WithEventFilter(predicate.ResourceVersionChangedPredicate{}).
831-
Complete(r)
847+
func (r *EphemeralRunnerReconciler) SetupWithManager(mgr ctrl.Manager, opts ...Option) error {
848+
return builderWithOptions(
849+
ctrl.NewControllerManagedBy(mgr).
850+
For(&v1alpha1.EphemeralRunner{}).
851+
Owns(&corev1.Pod{}).
852+
WithEventFilter(predicate.ResourceVersionChangedPredicate{}),
853+
opts,
854+
).Complete(r)
832855
}
833856

834857
func runnerContainerStatus(pod *corev1.Pod) *corev1.ContainerStatus {
+56
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
package actionsgithubcom
2+
3+
import (
4+
"sigs.k8s.io/controller-runtime/pkg/builder"
5+
"sigs.k8s.io/controller-runtime/pkg/controller"
6+
)
7+
8+
// Options is the optional configuration for the controllers, which can be
9+
// set via command-line flags or environment variables.
10+
type Options struct {
11+
// RunnerMaxConcurrentReconciles is the maximum number of concurrent Reconciles which can be run
12+
// by the EphemeralRunnerController.
13+
RunnerMaxConcurrentReconciles int
14+
}
15+
16+
// OptionsWithDefault returns the default options.
17+
// This is here to maintain the options and their default values in one place,
18+
// rather than having to correlate those in multiple places.
19+
func OptionsWithDefault() Options {
20+
return Options{
21+
RunnerMaxConcurrentReconciles: 2,
22+
}
23+
}
24+
25+
type Option func(*controller.Options)
26+
27+
// WithMaxConcurrentReconciles sets the maximum number of concurrent Reconciles which can be run.
28+
//
29+
// This is useful to improve the throughput of the controller, but it may also increase the load on the API server and
30+
// the external service (e.g. GitHub API). The default value is 1, as defined by the controller-runtime.
31+
//
32+
// See https://github.com/actions/actions-runner-controller/issues/3021 for more information
33+
// on real-world use cases and the potential impact of this option.
34+
func WithMaxConcurrentReconciles(n int) Option {
35+
return func(b *controller.Options) {
36+
b.MaxConcurrentReconciles = n
37+
}
38+
}
39+
40+
// builderWithOptions applies the given options to the provided builder, if any.
41+
// This is a helper function to avoid the need to import the controller-runtime package in every reconciler source file
42+
// and the command package that creates the controller.
43+
// This is also useful for reducing code duplication around setting controller options in
44+
// multiple reconcilers.
45+
func builderWithOptions(b *builder.Builder, opts []Option) *builder.Builder {
46+
if len(opts) == 0 {
47+
return b
48+
}
49+
50+
var controllerOpts controller.Options
51+
for _, opt := range opts {
52+
opt(&controllerOpts)
53+
}
54+
55+
return b.WithOptions(controllerOpts)
56+
}

go.mod

+3-3
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,11 @@ module github.com/actions/actions-runner-controller
33
go 1.22.4
44

55
require (
6-
github.com/bradleyfalzon/ghinstallation/v2 v2.8.0
6+
github.com/bradleyfalzon/ghinstallation/v2 v2.12.0
77
github.com/davecgh/go-spew v1.1.1
88
github.com/evanphx/json-patch v5.9.0+incompatible
99
github.com/go-logr/logr v1.4.1
10-
github.com/golang-jwt/jwt/v4 v4.5.0
10+
github.com/golang-jwt/jwt/v4 v4.5.1
1111
github.com/google/go-cmp v0.6.0
1212
github.com/google/go-github/v52 v52.0.0
1313
github.com/google/uuid v1.6.0
@@ -60,7 +60,7 @@ require (
6060
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
6161
github.com/golang/protobuf v1.5.3 // indirect
6262
github.com/google/gnostic-models v0.6.9-0.20230804172637-c7be7c783f49 // indirect
63-
github.com/google/go-github/v56 v56.0.0 // indirect
63+
github.com/google/go-github/v66 v66.0.0 // indirect
6464
github.com/google/go-querystring v1.1.0 // indirect
6565
github.com/google/gofuzz v1.2.0 // indirect
6666
github.com/google/pprof v0.0.0-20231101202521-4ca4178f5c7a // indirect

0 commit comments

Comments
 (0)