Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions .chloggen/githubreceiver-retry-transient-errors.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Use this changelog template to create an entry for release notes.

# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
change_type: bug_fix

# The name of the component, or a single word describing the area of concern, (e.g. receiver/filelog)
component: receiver/github

# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`).
note: Add configurable retry with exponential backoff for transient GitHub API errors (429, 502, 503, 504, secondary rate limits)

# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
issues: [43388]

# (Optional) One or more lines of additional information to render under the primary note.
# These lines will be padded with 2 spaces and then inserted directly into the document.
# Use pipe (|) for multiline entries.
subtext:

# If your change doesn't affect end users or the exported elements of any package,
# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
# Optional: The change log or logs in which this entry should be included.
# e.g. '[user]' or '[user, api]'
# Include 'user' if the change is relevant to end users.
# Include 'api' if there is a change to a library API.
# Default: '[user]'
change_logs: []
3 changes: 2 additions & 1 deletion receiver/githubreceiver/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ go 1.25.0

require (
github.com/Khan/genqlient v0.8.1
github.com/cenkalti/backoff/v5 v5.0.3
github.com/google/go-cmp v0.7.0
github.com/google/go-github/v84 v84.0.0
github.com/gorilla/mux v1.8.1
Expand All @@ -16,6 +17,7 @@ require (
go.opentelemetry.io/collector/config/confighttp v0.150.1-0.20260415114935-307e3abdbae9
go.opentelemetry.io/collector/config/confignet v1.56.1-0.20260415114935-307e3abdbae9
go.opentelemetry.io/collector/config/configopaque v1.56.1-0.20260415114935-307e3abdbae9
go.opentelemetry.io/collector/config/configretry v1.56.1-0.20260415114935-307e3abdbae9
go.opentelemetry.io/collector/confmap v1.56.1-0.20260415114935-307e3abdbae9
go.opentelemetry.io/collector/consumer v1.56.1-0.20260415114935-307e3abdbae9
go.opentelemetry.io/collector/consumer/consumertest v0.150.1-0.20260415114935-307e3abdbae9
Expand All @@ -40,7 +42,6 @@ require (
github.com/alexflint/go-scalar v1.2.0 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/bmatcuk/doublestar/v4 v4.6.1 // indirect
github.com/cenkalti/backoff/v5 v5.0.3 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect
github.com/ebitengine/purego v0.10.0 // indirect
Expand Down
4 changes: 2 additions & 2 deletions receiver/githubreceiver/go.sum

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -60,14 +60,45 @@ less per instance, or use `concurrency_limit` to control concurrent requests
- `collection_interval` should be long enough to avoid rate limiting (see above
formula). A sensible default is `300s`.

### Automatic Retry

The scraper automatically retries requests that fail with transient HTTP errors
using exponential backoff with jitter.

The following responses are retried:

- **403 Forbidden** with `Retry-After` header -- secondary rate limit
- **429 Too Many Requests** -- primary rate limit exceeded
- **502 Bad Gateway** -- GitHub's proxy failed to reach the backend
- **503 Service Unavailable** -- GitHub is temporarily down for maintenance
- **504 Gateway Timeout** -- GitHub's backend took too long to respond

Plain 403 responses (permission errors) are **not** retried. Retrying stops
after `max_retries` attempts (default 10) or when the scrape context is
cancelled at the start of the next collection interval, whichever comes first.

Retry behaviour is configurable under `retry_on_failure`:

```yaml
github:
  github_org: my-org
  retry_on_failure:
    enabled: true              # default
    max_retries: 10            # default; 0 = unlimited (bounded by context)
    initial_interval: 1s       # default
    max_interval: 30s          # default
    multiplier: 1.5            # default
    randomization_factor: 0.5  # default

### Configuration

#### Concurrency Limiting

**Important**: This does not guarantee that the secondary rate limit will not be
hit. It simply reduces the likelihood. In large repositories with lots of
history to iterate through, the chance of hitting the secondary rate limit
increases. If this value is too high, 403/429/502/503/504 errors may show up.

The scraper supports limiting the number of concurrent repository processing
goroutines to reduce the likelihood of hitting GitHub's 100 concurrent secondary
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,20 @@ import (
"errors"

"go.opentelemetry.io/collector/config/confighttp"
"go.opentelemetry.io/collector/config/configretry"

"github.com/open-telemetry/opentelemetry-collector-contrib/receiver/githubreceiver/internal"
"github.com/open-telemetry/opentelemetry-collector-contrib/receiver/githubreceiver/internal/metadata"
)

// RetryConfig defines retry behavior for transient GitHub API errors.
//
// It is exposed to users under the `retry_on_failure` key. The embedded
// BackOffConfig is squashed, so its settings sit at the same level as
// `max_retries` rather than under a nested key.
type RetryConfig struct {
// BackOffConfig supplies the on/off switch and the exponential backoff
// parameters (initial interval, multiplier, randomization factor, maximum
// interval and maximum elapsed time).
configretry.BackOffConfig `mapstructure:",squash"`
// MaxRetries is the maximum number of retry attempts per request.
// Set to 0 to rely solely on MaxElapsedTime / context cancellation.
// Negative values are rejected by Config.Validate.
MaxRetries int `mapstructure:"max_retries"`
}

// Config relating to GitHub Metric Scraper.
type Config struct {
confighttp.ClientConfig `mapstructure:",squash"`
Expand All @@ -29,6 +38,8 @@ type Config struct {
MergedPRLookbackDays int `mapstructure:"merged_pr_lookback_days"`
// SearchQuery is the query to use when defining a custom search for repository data
SearchQuery string `mapstructure:"search_query"`
// RetryConfig defines retry behavior for transient GitHub API errors.
RetryConfig RetryConfig `mapstructure:"retry_on_failure"`
}

// Validate validates the configuration
Expand All @@ -39,5 +50,11 @@ func (cfg *Config) Validate() error {
if cfg.MergedPRLookbackDays < 0 {
return errors.New("merged_pr_lookback_days must be non-negative")
}
if cfg.RetryConfig.MaxRetries < 0 {
return errors.New("max_retries must be non-negative")
}
if err := cfg.RetryConfig.Validate(); err != nil {
return err
}
return nil
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,10 @@ import (
"testing"
"time"

"github.com/cenkalti/backoff/v5"
"github.com/stretchr/testify/assert"
"go.opentelemetry.io/collector/config/confighttp"
"go.opentelemetry.io/collector/config/configretry"

"github.com/open-telemetry/opentelemetry-collector-contrib/receiver/githubreceiver/internal/metadata"
)
Expand All @@ -27,6 +29,16 @@ func TestConfig(t *testing.T) {
ClientConfig: clientConfig,
ConcurrencyLimit: 50,
MergedPRLookbackDays: 30,
RetryConfig: RetryConfig{
BackOffConfig: configretry.BackOffConfig{
Enabled: true,
InitialInterval: 1 * time.Second,
RandomizationFactor: backoff.DefaultRandomizationFactor,
Multiplier: backoff.DefaultMultiplier,
MaxInterval: 30 * time.Second,
},
MaxRetries: 10,
},
}

assert.Equal(t, expectedConfig, defaultConfig)
Expand Down Expand Up @@ -83,6 +95,26 @@ func TestConfigValidate(t *testing.T) {
},
wantErr: true,
},
{
name: "invalid config with negative max retries",
config: Config{
ConcurrencyLimit: 50,
RetryConfig: RetryConfig{
MaxRetries: -1,
},
},
wantErr: true,
},
{
name: "valid config with zero max retries (unlimited via time/context)",
config: Config{
ConcurrencyLimit: 50,
RetryConfig: RetryConfig{
MaxRetries: 0,
},
},
wantErr: false,
},
}

for _, tt := range tests {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@ import (
"context"
"time"

"github.com/cenkalti/backoff/v5"
"go.opentelemetry.io/collector/config/confighttp"
"go.opentelemetry.io/collector/config/configretry"
"go.opentelemetry.io/collector/receiver"
"go.opentelemetry.io/collector/scraper"

Expand All @@ -22,6 +24,7 @@ const (
defaultConcurrencyLimit = 50
defaultHTTPTimeout = 15 * time.Second
defaultMergedPRLookbackDays = 30
defaultMaxRetries = 10
)

type Factory struct{}
Expand All @@ -34,6 +37,16 @@ func (*Factory) CreateDefaultConfig() internal.Config {
ConcurrencyLimit: defaultConcurrencyLimit, // Default to 50 concurrent goroutines
MergedPRLookbackDays: defaultMergedPRLookbackDays,
MetricsBuilderConfig: metadata.DefaultMetricsBuilderConfig(),
RetryConfig: RetryConfig{
BackOffConfig: configretry.BackOffConfig{
Enabled: true,
InitialInterval: 1 * time.Second,
RandomizationFactor: backoff.DefaultRandomizationFactor,
Multiplier: backoff.DefaultMultiplier,
MaxInterval: 30 * time.Second,
},
MaxRetries: defaultMaxRetries,
},
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,20 @@ type githubScraper struct {
// start builds the scraper's HTTP client from its configuration and installs
// the retrying transport on it.
//
// The error from ToClient is returned unchanged when the client cannot be
// built; in that case the transport is left untouched. On success the client's
// transport is replaced by a retryRoundTripper that delegates to the original
// transport, so every request the scraper issues goes through the retry logic.
func (ghs *githubScraper) start(ctx context.Context, host component.Host) (err error) {
	ghs.logger.Sugar().Info("starting the GitHub scraper")

	ghs.client, err = ghs.cfg.ToClient(ctx, host.GetExtensions(), ghs.settings)
	if err != nil {
		// Do not fall through: ghs.client must not be dereferenced when
		// client construction failed.
		return err
	}

	// Wrap the transport with retry logic for transient GitHub API errors.
	// Retries are bounded by the scrape context (cancelled at next collection
	// interval).
	ghs.client.Transport = &retryRoundTripper{
		base:   ghs.client.Transport,
		cfg:    ghs.cfg.RetryConfig,
		logger: ghs.logger,
	}

	return nil
}

func newGitHubScraper(
Expand Down
133 changes: 133 additions & 0 deletions receiver/githubreceiver/internal/scraper/githubscraper/retry.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
// Copyright The OpenTelemetry Authors
// SPDX-License-Identifier: Apache-2.0

package githubscraper // import "github.com/open-telemetry/opentelemetry-collector-contrib/receiver/githubreceiver/internal/scraper/githubscraper"

import (
"io"
"net/http"
"strconv"
"time"

"github.com/cenkalti/backoff/v5"
"go.uber.org/zap"
)

// retryRoundTripper wraps an http.RoundTripper and retries on transient GitHub
// API errors (429, 502, 503, 504) and secondary rate limits (403 + Retry-After).
// Retries use exponential backoff with jitter and are bounded by MaxRetries,
// MaxElapsedTime, and the request context (cancelled when the scrape cycle ends).
type retryRoundTripper struct {
// base is the wrapped transport that performs every attempt, including
// the first one.
base http.RoundTripper
// cfg holds the enable switch, the backoff parameters and the retry
// limits consulted on each RoundTrip call.
cfg RetryConfig
// logger receives debug-level messages about each retry and about
// failures to drain or close a discarded response body.
logger *zap.Logger
}

// RoundTrip sends req through the wrapped transport and, when retries are
// enabled, re-sends it for as long as the response is a transient GitHub
// failure (see isRetryable).
//
// Between attempts it waits for an exponential backoff delay, or for the
// number of seconds given by the response's Retry-After header when one is
// present. Retrying stops when:
//   - a non-retryable response is received (it is returned to the caller),
//   - MaxRetries attempts have been made (when MaxRetries > 0),
//   - MaxElapsedTime has passed since the first failure (when > 0),
//   - the request context is done (its error is returned), or
//   - the transport or GetBody returns an error (that error is returned).
//
// When a limit is hit, the last retryable response is returned with its body
// still open so the caller can inspect it. A request whose body cannot be
// replayed (non-empty Body with a nil GetBody) is never retried.
func (rt *retryRoundTripper) RoundTrip(req *http.Request) (*http.Response, error) {
	// http.Client treats a nil Transport as http.DefaultTransport; mirror
	// that here instead of panicking when no base transport was configured.
	base := rt.base
	if base == nil {
		base = http.DefaultTransport
	}

	resp, err := base.RoundTrip(req)
	if err != nil || !rt.cfg.Enabled {
		return resp, err
	}

	if !isRetryable(resp) {
		return resp, nil
	}

	// The first attempt consumed the request body. Without GetBody there is
	// no way to recreate it, and re-sending would transmit an empty or
	// closed body, so hand the failed response back to the caller instead.
	if req.Body != nil && req.Body != http.NoBody && req.GetBody == nil {
		rt.logger.Debug("not retrying GitHub API request: request body is not replayable",
			zap.String("url", req.URL.String()),
			zap.Int("status", resp.StatusCode),
		)
		return resp, nil
	}

	b := &backoff.ExponentialBackOff{
		InitialInterval:     rt.cfg.InitialInterval,
		RandomizationFactor: rt.cfg.RandomizationFactor,
		Multiplier:          rt.cfg.Multiplier,
		MaxInterval:         rt.cfg.MaxInterval,
	}
	b.Reset()

	ctx := req.Context()
	start := time.Now()
	for attempt := 0; isRetryable(resp); attempt++ {
		// MaxRetries == 0 disables the attempt cap.
		if rt.cfg.MaxRetries > 0 && attempt >= rt.cfg.MaxRetries {
			break
		}
		// MaxElapsedTime == 0 disables the time cap.
		if rt.cfg.MaxElapsedTime > 0 && time.Since(start) >= rt.cfg.MaxElapsedTime {
			break
		}

		delay := b.NextBackOff()

		// Honor Retry-After header from GitHub rate limits. The value is
		// used as-is (not capped) because retrying before the server's
		// requested delay just wastes an attempt. Context cancellation
		// and MaxRetries already bound total retry behavior.
		if ra := parseRetryAfter(resp.Header); ra > 0 {
			delay = time.Duration(ra) * time.Second
			b.Reset()
		}

		rt.logger.Debug("retrying GitHub API request",
			zap.String("url", req.URL.String()),
			zap.Int("status", resp.StatusCode),
			zap.Int("attempt", attempt+1),
			zap.Duration("backoff", delay),
		)

		// Drain and close the response body to reuse the TCP connection.
		if _, drainErr := io.Copy(io.Discard, resp.Body); drainErr != nil {
			rt.logger.Debug("failed to drain response body", zap.Error(drainErr))
		}
		if closeErr := resp.Body.Close(); closeErr != nil {
			rt.logger.Debug("failed to close response body", zap.Error(closeErr))
		}

		// Wait for backoff or context cancellation.
		timer := time.NewTimer(delay)
		select {
		case <-ctx.Done():
			timer.Stop()
			return nil, ctx.Err()
		case <-timer.C:
		}

		// A RoundTripper must not modify the caller's request, so each
		// retry is sent on a clone that carries a fresh copy of the body
		// (genqlient uses bytes.NewReader which auto-sets GetBody, making
		// POST bodies replayable).
		retryReq := req.Clone(ctx)
		if req.GetBody != nil {
			body, bodyErr := req.GetBody()
			if bodyErr != nil {
				return nil, bodyErr
			}
			retryReq.Body = body
		}

		resp, err = base.RoundTrip(retryReq)
		if err != nil {
			return resp, err
		}
	}

	return resp, nil
}

// isRetryable reports whether resp carries a status that signals a transient
// GitHub API failure and is therefore worth another attempt.
//
// 429, 502, 503 and 504 always qualify. A 403 qualifies only when it carries a
// non-empty Retry-After header (secondary rate limit); a bare 403 is treated
// as a permanent permission error.
func isRetryable(resp *http.Response) bool {
	code := resp.StatusCode

	if code == http.StatusForbidden {
		hasRetryAfter := resp.Header.Get("Retry-After") != ""
		return hasRetryAfter
	}

	transient := code == http.StatusTooManyRequests || // 429
		code == http.StatusBadGateway || // 502
		code == http.StatusServiceUnavailable || // 503
		code == http.StatusGatewayTimeout // 504
	return transient
}

// parseRetryAfter extracts the delay, in whole seconds, requested by a
// Retry-After header.
//
// Both forms allowed by HTTP are understood: a non-negative integer number of
// seconds, and an HTTP-date, which is converted to the time remaining until
// that date (rounded up to the next second). It returns 0 when the header is
// absent, cannot be parsed, is negative, or names a date that is not in the
// future, so callers can treat any result > 0 as "the server asked us to
// wait this long".
func parseRetryAfter(h http.Header) int {
	v := h.Get("Retry-After")
	if v == "" {
		return 0
	}

	// delay-seconds form.
	if seconds, err := strconv.Atoi(v); err == nil {
		if seconds < 0 {
			return 0
		}
		return seconds
	}

	// HTTP-date form.
	when, err := http.ParseTime(v)
	if err != nil {
		return 0
	}
	remaining := time.Until(when)
	if remaining <= 0 {
		return 0
	}
	// Round up so we never retry before the requested instant.
	return int((remaining + time.Second - 1) / time.Second)
}
Loading
Loading