Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 67 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,73 @@ The library handles DNS resolution differently depending on the connection type:

**Important for Privacy**: When using `socks5h://` or other remote DNS proxies, your local DNS servers will not see any queries for the target domains, maintaining better privacy and anonymity.


### Metrics and Observability

`gowarc` provides a `StatsRegistry` interface that allows you to integrate your own metrics collection system (Prometheus, Datadog, etc.). The library tracks various metrics including data written, deduplication statistics, and proxy usage.

#### Using the StatsRegistry Interface

The `StatsRegistry` interface can be found in [`stats.go`](stats.go). To implement your own metrics collection:

```go
// Implement the StatsRegistry interface
type MyPrometheusRegistry struct {
// Your Prometheus registry fields
}

func (r *MyPrometheusRegistry) RegisterCounter(name, help string, labelNames []string) warc.Counter {
// Return a Counter that wraps your Prometheus counter
// The Counter interface requires WithLabels() method for dimensional metrics
}

func (r *MyPrometheusRegistry) RegisterGauge(name, help string, labelNames []string) warc.Gauge {
// Return a Gauge that wraps your Prometheus gauge
}

func (r *MyPrometheusRegistry) RegisterHistogram(name, help string, buckets []int64, labelNames []string) warc.Histogram {
// Return a Histogram that wraps your Prometheus histogram
}

// Pass your registry to the HTTP client
clientSettings := warc.HTTPClientSettings{
StatsRegistry: &MyPrometheusRegistry{},
// ... other settings
}
```

#### Available Metrics

The library tracks the following metrics:

- **`total_data_written`**: Total bytes written to WARC files
- **`local_deduped_bytes_total`**: Bytes saved through local deduplication
- **`local_deduped_total`**: Number of records deduplicated locally
- **`doppelganger_deduped_bytes_total`**: Bytes saved through Doppelganger deduplication
- **`doppelganger_deduped_total`**: Number of records deduplicated via Doppelganger
- **`cdx_deduped_bytes_total`**: Bytes saved through CDX deduplication
- **`cdx_deduped_total`**: Number of records deduplicated via CDX
- **`proxy_requests_total`**: Total requests through each proxy (with `proxy` label)
- **`proxy_errors_total`**: Total errors for each proxy (with `proxy` label)
- **`proxy_last_used_nanoseconds`**: Last usage timestamp for each proxy (with `proxy` label)

#### Label Support

Metrics support Prometheus-style labels for dimensional data:

```go
// Register a counter with label dimensions
counter := registry.RegisterCounter("http_requests_total", "Total HTTP requests", []string{"method", "status"})

// Record metrics with specific label values
counter.WithLabels(warc.Labels{"method": "GET", "status": "200"}).Inc()
counter.WithLabels(warc.Labels{"method": "POST", "status": "201"}).Add(5)

// Each unique label combination creates a separate metric series
```

**Interface Details**: See the complete interface contract in [`stats.go`](stats.go) for full implementation requirements.

## CLI Tools

In addition to the Go library, gowarc provides several command-line utilities for working with WARC files:
Expand Down
77 changes: 56 additions & 21 deletions client.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ import (
"net/http"
"os"
"sync"
"sync/atomic"
"time"
)

Expand All @@ -13,9 +12,52 @@ type Error struct {
Func string
}

// ProxyNetwork defines the network layer (IPv4/IPv6) a proxy can support
type ProxyNetwork int

const (
// ProxyNetworkUnset is the zero value and must not be used - forces explicit selection
ProxyNetworkUnset ProxyNetwork = iota
// ProxyNetworkAny means the proxy can be used for both IPv4 and IPv6 connections
ProxyNetworkAny
// ProxyNetworkIPv4 means the proxy should only be used for IPv4 connections
ProxyNetworkIPv4
// ProxyNetworkIPv6 means the proxy should only be used for IPv6 connections
ProxyNetworkIPv6
)

// ProxyType defines the infrastructure type of a proxy
type ProxyType int

const (
// ProxyTypeAny means the proxy can be used for any type of request
ProxyTypeAny ProxyType = iota
// ProxyTypeMobile means the proxy uses mobile network infrastructure
ProxyTypeMobile
// ProxyTypeResidential means the proxy uses residential IP addresses
ProxyTypeResidential
// ProxyTypeDatacenter means the proxy uses datacenter infrastructure
ProxyTypeDatacenter
)

// ProxyConfig defines the configuration for a single proxy
type ProxyConfig struct {
// URL is the proxy URL (e.g., "socks5://proxy.example.com:1080")
URL string
// Network specifies if this proxy supports IPv4, IPv6, or both
Network ProxyNetwork
// Type specifies the infrastructure type (Mobile, Residential, Datacenter, or Any)
Type ProxyType
// AllowedDomains is a list of glob patterns for domains this proxy should handle
// Examples: "*.example.com", "api.*.org"
// If empty, the proxy can be used for any domain
AllowedDomains []string
}

type HTTPClientSettings struct {
RotatorSettings *RotatorSettings
Proxy string
Proxies []ProxyConfig
AllowDirectFallback bool
TempDir string
DiscardHook DiscardHook
DNSServers []string
Expand All @@ -39,6 +81,7 @@ type HTTPClientSettings struct {
DisableIPv6 bool
IPv6AnyIP bool
DigestAlgorithm DigestAlgorithm
StatsRegistry StatsRegistry
}

type CustomHTTPClient struct {
Expand All @@ -64,15 +107,8 @@ type CustomHTTPClient struct {
// If set to <= 0, the default value is DefaultMaxRAMUsageFraction.
MaxRAMUsageFraction float64
randomLocalIP bool
DataTotal *atomic.Int64

CDXDedupeTotalBytes *atomic.Int64
DoppelgangerDedupeTotalBytes *atomic.Int64
LocalDedupeTotalBytes *atomic.Int64

CDXDedupeTotal *atomic.Int64
DoppelgangerDedupeTotal *atomic.Int64
LocalDedupeTotal *atomic.Int64
statsRegistry StatsRegistry
}

func (c *CustomHTTPClient) Close() error {
Expand Down Expand Up @@ -106,16 +142,15 @@ func (c *CustomHTTPClient) Close() error {
func NewWARCWritingHTTPClient(HTTPClientSettings HTTPClientSettings) (httpClient *CustomHTTPClient, err error) {
httpClient = new(CustomHTTPClient)

// Initialize counters
httpClient.DataTotal = &DataTotal

httpClient.CDXDedupeTotalBytes = &CDXDedupeTotalBytes
httpClient.DoppelgangerDedupeTotalBytes = &DoppelgangerDedupeTotalBytes
httpClient.LocalDedupeTotalBytes = &LocalDedupeTotalBytes

httpClient.CDXDedupeTotal = &CDXDedupeTotal
httpClient.DoppelgangerDedupeTotal = &DoppelgangerDedupeTotal
httpClient.LocalDedupeTotal = &LocalDedupeTotal
// Initialize stats registry
if HTTPClientSettings.StatsRegistry != nil {
httpClient.statsRegistry = HTTPClientSettings.StatsRegistry
HTTPClientSettings.RotatorSettings.StatsRegistry = HTTPClientSettings.StatsRegistry
} else {
localStatsRegistry := newLocalRegistry()
httpClient.statsRegistry = localStatsRegistry
HTTPClientSettings.RotatorSettings.StatsRegistry = localStatsRegistry
}

// Configure random local IP
httpClient.randomLocalIP = HTTPClientSettings.RandomLocalIP
Expand Down Expand Up @@ -216,7 +251,7 @@ func NewWARCWritingHTTPClient(HTTPClientSettings HTTPClientSettings) (httpClient
httpClient.ConnReadDeadline = HTTPClientSettings.ConnReadDeadline

// Configure custom dialer / transport
customDialer, err := newCustomDialer(httpClient, HTTPClientSettings.Proxy, HTTPClientSettings.DialTimeout, HTTPClientSettings.DNSRecordsTTL, HTTPClientSettings.DNSResolutionTimeout, HTTPClientSettings.DNSCacheSize, HTTPClientSettings.DNSServers, HTTPClientSettings.DNSConcurrency, HTTPClientSettings.DisableIPv4, HTTPClientSettings.DisableIPv6)
customDialer, err := newCustomDialer(httpClient, HTTPClientSettings.Proxies, HTTPClientSettings.AllowDirectFallback, HTTPClientSettings.DialTimeout, HTTPClientSettings.DNSRecordsTTL, HTTPClientSettings.DNSResolutionTimeout, HTTPClientSettings.DNSCacheSize, HTTPClientSettings.DNSServers, HTTPClientSettings.DNSConcurrency, HTTPClientSettings.DisableIPv4, HTTPClientSettings.DisableIPv6)
if err != nil {
return nil, err
}
Expand Down
85 changes: 26 additions & 59 deletions client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -160,9 +160,6 @@ func TestHTTPClient(t *testing.T) {
err error
)

// Reset counter to 0
DataTotal.Store(0)

// init test HTTP endpoint
server := newTestImageServer(t, http.StatusOK)
defer server.Close()
Expand Down Expand Up @@ -207,7 +204,7 @@ func TestHTTPClient(t *testing.T) {
}

// verify that the remote dedupe count is correct
dataTotal := httpClient.DataTotal.Load()
dataTotal := httpClient.statsRegistry.RegisterCounter(totalDataWritten, totalDataWrittenHelp, nil).WithLabels(nil).Get()
if dataTotal != expectedPayloadBytes {
t.Fatalf("total bytes downloaded mismatch, expected %d got %d", expectedPayloadBytes, dataTotal)
}
Expand Down Expand Up @@ -688,7 +685,13 @@ func TestHTTPClientWithProxy(t *testing.T) {
// init the HTTP client responsible for recording HTTP(s) requests / responses
httpClient, err := NewWARCWritingHTTPClient(HTTPClientSettings{
RotatorSettings: rotatorSettings,
Proxy: fmt.Sprintf("socks5://%s", proxyAddr)})
Proxies: []ProxyConfig{
{
URL: fmt.Sprintf("socks5://%s", proxyAddr),
Network: ProxyNetworkAny,
Type: ProxyTypeAny,
},
}})
if err != nil {
t.Fatalf("Unable to init WARC writing HTTP client: %s", err)
}
Expand Down Expand Up @@ -846,10 +849,6 @@ func TestHTTPClientLocalDedupe(t *testing.T) {
err error
)

// Reset counter to 0
LocalDedupeTotal.Store(0)
LocalDedupeTotalBytes.Store(0)

// init test HTTP endpoint
server := newTestImageServer(t, http.StatusOK)
defer server.Close()
Expand Down Expand Up @@ -897,18 +896,13 @@ func TestHTTPClientLocalDedupe(t *testing.T) {
}

// verify that the local dedupe count is correct
if LocalDedupeTotalBytes.Load() != 26872 {
t.Fatalf("local dedupe total bytes mismatch, expected: 26872 got: %d", LocalDedupeTotalBytes.Load())
}

// Ensure that HTTP client results work correctly as well
if httpClient.LocalDedupeTotalBytes.Load() != 26872 {
t.Fatalf("local dedupe total bytes mismatch, expected: 26872 got: %d", httpClient.LocalDedupeTotalBytes.Load())
if httpClient.statsRegistry.RegisterCounter(localDedupedBytesTotal, localDedupedBytesTotalHelp, nil).WithLabels(nil).Get() != 26872 {
t.Fatalf("local dedupe total bytes mismatch, expected: 26872 got: %d", httpClient.statsRegistry.RegisterCounter(localDedupedBytesTotal, localDedupedBytesTotalHelp, nil).WithLabels(nil).Get())
}

// 1 is expected due to requiring one request to enter into the table.
if httpClient.LocalDedupeTotal.Load() != 1 {
t.Fatalf("local dedupe total mismatch, expected: 1 got: %d", httpClient.LocalDedupeTotal.Load())
if httpClient.statsRegistry.RegisterCounter(localDedupedTotal, localDedupedTotalHelp, nil).WithLabels(nil).Get() != 1 {
t.Fatalf("local dedupe total mismatch, expected: 1 got: %d", httpClient.statsRegistry.RegisterCounter(localDedupedTotal, localDedupedTotalHelp, nil).WithLabels(nil).Get())
}
}

Expand All @@ -922,10 +916,6 @@ func TestHTTPClientRemoteDedupe(t *testing.T) {
// init test HTTP endpoint
mux := http.NewServeMux()

// Reset counter to 0
CDXDedupeTotal.Store(0)
CDXDedupeTotalBytes.Store(0)

mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
fileBytes, err := os.ReadFile(path.Join("testdata", "image.svg"))
if err != nil {
Expand Down Expand Up @@ -990,18 +980,13 @@ func TestHTTPClientRemoteDedupe(t *testing.T) {
testFileRevisitVailidity(t, path, "2022-03-20T00:25:18Z", "sha1:UIRWL5DFIPQ4MX3D3GFHM2HCVU3TZ6I3", false)
}

// verify that the remote dedupe count is correct
if CDXDedupeTotalBytes.Load() != 107488 {
t.Fatalf("remote dedupe total bytes mismatch, expected: 107488 got: %d", CDXDedupeTotalBytes.Load())
}

// Ensure that HTTP client results work correctly as well
if httpClient.CDXDedupeTotalBytes.Load() != 107488 {
t.Fatalf("remote dedupe total bytes mismatch, expected: 107488 got: %d", httpClient.CDXDedupeTotalBytes.Load())
// verify that the CDX dedupe count is correct
if httpClient.statsRegistry.RegisterCounter(cdxDedupedBytesTotal, cdxDedupedBytesTotalHelp, nil).WithLabels(nil).Get() != 107488 {
t.Fatalf("CDX dedupe total bytes mismatch, expected: 26872 got: %d", httpClient.statsRegistry.RegisterCounter(cdxDedupedBytesTotal, cdxDedupedBytesTotalHelp, nil).WithLabels(nil).Get())
}

if httpClient.CDXDedupeTotal.Load() != 4 {
t.Fatalf("remote dedupe total mismatch, expected: 4 got: %d", httpClient.CDXDedupeTotal.Load())
if httpClient.statsRegistry.RegisterCounter(cdxDedupedTotal, cdxDedupedTotalHelp, nil).WithLabels(nil).Get() != 4 {
t.Fatalf("CDX dedupe total mismatch, expected: 1 got: %d", httpClient.statsRegistry.RegisterCounter(cdxDedupedTotal, cdxDedupedTotalHelp, nil).WithLabels(nil).Get())
}
}

Expand All @@ -1016,10 +1001,6 @@ func TestHTTPClientDoppelgangerDedupe(t *testing.T) {
// init test HTTP endpoint
mux := http.NewServeMux()

// Reset counter to 0
DoppelgangerDedupeTotal.Store(0)
DoppelgangerDedupeTotalBytes.Store(0)

mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
fileBytes, err := os.ReadFile(path.Join("testdata", "image.svg"))
if err != nil {
Expand Down Expand Up @@ -1089,18 +1070,13 @@ func TestHTTPClientDoppelgangerDedupe(t *testing.T) {
testFileRevisitVailidity(t, path, "2022-03-20T00:25:18Z", "sha1:UIRWL5DFIPQ4MX3D3GFHM2HCVU3TZ6I3", false)
}

// verify that the remote dedupe count is correct
if DoppelgangerDedupeTotalBytes.Load() != 107488 {
t.Fatalf("remote dedupe total bytes mismatch, expected: 107488 got: %d", DoppelgangerDedupeTotalBytes.Load())
}

// Ensure that HTTP client results work correctly as well
if httpClient.DoppelgangerDedupeTotalBytes.Load() != 107488 {
t.Fatalf("remote dedupe total bytes mismatch, expected: 107488 got: %d", httpClient.DoppelgangerDedupeTotalBytes.Load())
// verify that the Doppelganger count is correct
if httpClient.statsRegistry.RegisterCounter(doppelgangerDedupedBytesTotal, doppelgangerDedupedBytesTotalHelp, nil).WithLabels(nil).Get() != 107488 {
t.Fatalf("Doppelganger total bytes mismatch, expected: 26872 got: %d", httpClient.statsRegistry.RegisterCounter(doppelgangerDedupedBytesTotal, doppelgangerDedupedBytesTotalHelp, nil).WithLabels(nil).Get())
}

if httpClient.DoppelgangerDedupeTotal.Load() != 4 {
t.Fatalf("remote dedupe total mismatch, expected: 4 got: %d", httpClient.DoppelgangerDedupeTotal.Load())
if httpClient.statsRegistry.RegisterCounter(doppelgangerDedupedTotal, doppelgangerDedupedTotalHelp, nil).WithLabels(nil).Get() != 4 {
t.Fatalf("Doppelganger total mismatch, expected: 1 got: %d", httpClient.statsRegistry.RegisterCounter(doppelgangerDedupedTotal, doppelgangerDedupedTotalHelp, nil).WithLabels(nil).Get())
}
}

Expand All @@ -1110,10 +1086,6 @@ func TestHTTPClientDedupeEmptyPayload(t *testing.T) {
err error
)

// Reset counter to 0
LocalDedupeTotal.Store(0)
LocalDedupeTotalBytes.Store(0)

// init test HTTP endpoint
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
// Empty. This is intentional to mirror 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ.
Expand Down Expand Up @@ -1167,17 +1139,12 @@ func TestHTTPClientDedupeEmptyPayload(t *testing.T) {
}

// verify that the local dedupe count is correct
if LocalDedupeTotalBytes.Load() != 0 {
t.Fatalf("local dedupe total bytes mismatch, expected: 0 got: %d", LocalDedupeTotalBytes.Load())
}

// Ensure that HTTP client results work correctly as well
if httpClient.LocalDedupeTotalBytes.Load() != 0 {
t.Fatalf("local dedupe total bytes mismatch, expected: 0 got: %d", httpClient.LocalDedupeTotalBytes.Load())
if httpClient.statsRegistry.RegisterCounter(localDedupedBytesTotal, localDedupedBytesTotalHelp, nil).WithLabels(nil).Get() != 0 {
t.Fatalf("local dedupe total bytes mismatch, expected: 26872 got: %d", httpClient.statsRegistry.RegisterCounter(localDedupedBytesTotal, localDedupedBytesTotalHelp, nil).WithLabels(nil).Get())
}

if httpClient.LocalDedupeTotal.Load() != 0 {
t.Fatalf("local dedupe total mismatch, expected: 0 got: %d", httpClient.LocalDedupeTotal.Load())
if httpClient.statsRegistry.RegisterCounter(localDedupedTotal, localDedupedTotalHelp, nil).WithLabels(nil).Get() != 0 {
t.Fatalf("local dedupe total mismatch, expected: 1 got: %d", httpClient.statsRegistry.RegisterCounter(localDedupedTotal, localDedupedTotalHelp, nil).WithLabels(nil).Get())
}
}

Expand Down
Loading
Loading