internetarchive
diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml
Lines changed: 37 additions & 4 deletions
diff --git a/client_test.go b/client_test.go
Lines changed: 76 additions & 13 deletions
diff --git a/cmd/warc/mend/mend_test.go b/cmd/warc/mend/mend_test.go
Lines changed: 20 additions & 17 deletions
@@ -11,8 +11,11 @@ permissions:
 
 jobs:
   build:
-    runs-on: ubuntu-latest
-    timeout-minutes: 5
+    runs-on: ${{ matrix.os }}
+    timeout-minutes: 15
+    strategy:
+      matrix:
+        os: [ubuntu-latest, macos-latest]
     steps:
     - uses: actions/checkout@v5
 
@@ -25,12 +28,42 @@ jobs:
       run: go build -v ./...
 
     - name: Goroutine leak detector
+      if: matrix.os == 'ubuntu-latest'
       continue-on-error: true
       run: go test -c -o tests && for test in $(go test -list . | grep -E "^(Test|Example)"); do ./tests -test.run "^$test\$" &>/dev/null && echo -e "$test passed\n" || echo -e "$test failed\n"; done
 
-    - name: Test
+    - name: Test (Full Suite)
+      if: matrix.os == 'ubuntu-latest'
       run: go test -race -v ./...
 
+    - name: Test (spooledtempfile only)
+      if: matrix.os == 'macos-latest'
+      run: go test -race -v ./pkg/spooledtempfile/...
+
     - name: Benchmarks
+      if: matrix.os == 'ubuntu-latest'
       run: go test -bench=. -benchmem -run=^$ ./...
-      
+
+    # Platform-specific test verification
+    - name: Test Linux-specific memory implementation
+      if: matrix.os == 'ubuntu-latest'
+      run: |
+        echo "Running Linux-specific memory tests..."
+        cd pkg/spooledtempfile
+        go test -v -run "TestCgroup|TestHostMeminfo|TestRead"
+
+    - name: Test macOS-specific memory implementation
+      if: matrix.os == 'macos-latest'
+      run: |
+        echo "Running macOS-specific memory tests..."
+        cd pkg/spooledtempfile
+        go test -v -run "TestGetSystemMemoryUsedFraction|TestSysctlMemoryValues|TestMemoryFractionConsistency"
+
+    # Cross-compilation verification
+    - name: Cross-compile for macOS (from Linux)
+      if: matrix.os == 'ubuntu-latest'
+      run: GOOS=darwin GOARCH=amd64 go build ./...
+
+    - name: Cross-compile for Linux (from macOS)
+      if: matrix.os == 'macos-latest'
+      run: GOOS=linux GOARCH=amd64 go build ./...
@@ -8,6 +8,7 @@ import (
 	"crypto/x509"
 	"crypto/x509/pkix"
 	"errors"
+	"fmt"
 	"io"
 	"math/big"
 	"net"
@@ -16,6 +17,7 @@ import (
 	"os"
 	"path"
 	"path/filepath"
+	"strconv"
 	"strings"
 	"sync"
 	"testing"
@@ -63,6 +65,46 @@ func defaultBenchmarkRotatorSettings(t *testing.B) *RotatorSettings {
 	return rotatorSettings
 }
 
+// sumRecordContentLengths opens the WARC file at path, reads it record by record through NewReader, and returns the sum of the records' Content-Length header values; any open, read, parse, or close error aborts with a zero total.
+func sumRecordContentLengths(path string) (int64, error) {
+	file, err := os.Open(path)
+	if err != nil {
+		return 0, err
+	}
+	defer file.Close()
+
+	reader, err := NewReader(file)
+	if err != nil {
+		return 0, err
+	}
+
+	var total int64
+	for {
+		record, err := reader.ReadRecord()
+		if err != nil {
+			if err == io.EOF { // io.EOF ends the loop normally; any other read error is returned
+				break
+			}
+			return 0, err
+		}
+
+		clStr := record.Header.Get("Content-Length")
+		cl, err := strconv.ParseInt(clStr, 10, 64)
+		if err != nil {
+			record.Content.Close() // close result is discarded here; the parse error is what gets returned
+			return 0, fmt.Errorf("parsing Content-Length %q: %w", clStr, err)
+		}
+
+		total += cl // only the declared header value is summed; the record content is not read here
+
+		if err := record.Content.Close(); err != nil { // close this record's content before the next ReadRecord call
+			return 0, err
+		}
+	}
+
+	return total, nil
+}
+
 // Helper function used in many tests
 func drainErrChan(t *testing.T, errChan chan *Error) func() {
 	var wg sync.WaitGroup
@@ -153,21 +195,27 @@ func TestHTTPClient(t *testing.T) {
 		t.Fatal(err)
 	}
 
+	var expectedPayloadBytes int64
 	for _, path := range files {
 		testFileSingleHashCheck(t, path, "sha1:UIRWL5DFIPQ4MX3D3GFHM2HCVU3TZ6I3", []string{"26872"}, 1, server.URL+"/testdata/image.svg")
+
+		totalBytes, err := sumRecordContentLengths(path)
+		if err != nil {
+			t.Fatalf("failed to sum record content lengths for %s: %v", path, err)
+		}
+		expectedPayloadBytes += totalBytes
 	}
 
 	// verify that the remote dedupe count is correct
 	dataTotal := httpClient.DataTotal.Load()
-	if dataTotal < 27130 || dataTotal > 27160 {
-		t.Fatalf("total bytes downloaded mismatch, expected: 27130-27160 got: %d", dataTotal)
+	if dataTotal != expectedPayloadBytes {
+		t.Fatalf("total bytes downloaded mismatch, expected %d got %d", expectedPayloadBytes, dataTotal)
 	}
 }
 
 func TestHTTPClientRequestFailing(t *testing.T) {
 	var (
 		rotatorSettings = defaultRotatorSettings(t)
-		errWg           sync.WaitGroup
 		err             error
 	)
 
@@ -180,11 +228,14 @@ func TestHTTPClientRequestFailing(t *testing.T) {
 	if err != nil {
 		t.Fatalf("Unable to init WARC writing HTTP client: %s", err)
 	}
-	errWg.Add(1)
+
+	errCh := make(chan *Error, 1)
+	var errChWg sync.WaitGroup
+	errChWg.Add(1)
 	go func() {
-		defer errWg.Done()
-		for _ = range httpClient.ErrChan {
-			// We expect an error here, so we don't need to log it
+		defer errChWg.Done()
+		for err := range httpClient.ErrChan {
+			errCh <- err
 		}
 	}()
 
@@ -199,10 +250,21 @@ func TestHTTPClientRequestFailing(t *testing.T) {
 
 	_, err = httpClient.Do(req)
 	if err == nil {
-		t.Fatal("expected error on Do, got none")
+		select {
+		case recv := <-errCh:
+			if recv == nil {
+				t.Fatal("expected error via ErrChan but channel closed without value")
+			}
+		case <-time.After(2 * time.Second):
+			t.Fatal("expected error on Do or via ErrChan, got none")
+		}
+	} else {
+		t.Logf("got expected error: %v", err)
 	}
 
 	httpClient.Close()
+	errChWg.Wait()
+	close(errCh)
 }
 
 func TestHTTPClientConnReadDeadline(t *testing.T) {
@@ -594,15 +656,15 @@ func TestHTTPClientWithProxy(t *testing.T) {
 
 	// init socks5 proxy server
 	proxyServer := socks5.NewServer()
+	listener, err := net.Listen("tcp", "127.0.0.1:0")
+	if err != nil {
+		t.Fatalf("failed to listen for proxy: %v", err)
+	}
 
 	// Create a channel to signal server stop
 	stopChan := make(chan struct{})
 
 	go func() {
-		listener, err := net.Listen("tcp", "127.0.0.1:8000")
-		if err != nil {
-			panic(err)
-		}
 		defer listener.Close()
 
 		go func() {
@@ -615,6 +677,7 @@ func TestHTTPClientWithProxy(t *testing.T) {
 		}
 	}()
 
+	proxyAddr := listener.Addr().String()
 	// Defer sending the stop signal
 	defer close(stopChan)
 
@@ -625,7 +688,7 @@ func TestHTTPClientWithProxy(t *testing.T) {
 	// init the HTTP client responsible for recording HTTP(s) requests / responses
 	httpClient, err := NewWARCWritingHTTPClient(HTTPClientSettings{
 		RotatorSettings: rotatorSettings,
-		Proxy:           "socks5://127.0.0.1:8000"})
+		Proxy:           fmt.Sprintf("socks5://%s", proxyAddr)})
 	if err != nil {
 		t.Fatalf("Unable to init WARC writing HTTP client: %s", err)
 	}
 
@@ -6,15 +6,25 @@ import (
 	"io"
 	"os"
 	"path/filepath"
+	"runtime"
 	"testing"
 
 	"github.com/internetarchive/gowarc/cmd/warc/verify"
 	"github.com/spf13/cobra"
 )
 
+// getTestdataDir returns the path to the testdata/warcs directory, built from the source file of the function that called it.
+// runtime.Caller(1) reports the caller's frame, so the result is relative to this test file only when the caller lives in it.
+// Test file is at: cmd/warc/mend/mend_test.go, testdata is at: testdata/warcs,
+// so the path climbs three directory levels ("../../../") from the caller's directory.
+func getTestdataDir() string {
+	_, filename, _, _ := runtime.Caller(1) // NOTE(review): the ok result is discarded; runtime.Caller(0) would pin the path to this file regardless of caller — confirm intent
+	return filepath.Join(filepath.Dir(filename), "../../../testdata/warcs")
+}
+
 // TestAnalyzeWARCFile tests the analysis of different WARC files
 func TestAnalyzeWARCFile(t *testing.T) {
-	testdataDir := "../../testdata/warcs"
+	testdataDir := getTestdataDir()
 
 	tests := []struct {
 		name            string
@@ -128,7 +138,7 @@ func TestAnalyzeWARCFile(t *testing.T) {
 
 // TestMendResultValidation tests that mendResult structs are properly populated
 func TestMendResultValidation(t *testing.T) {
-	testdataDir := "../../testdata/warcs"
+	testdataDir := getTestdataDir()
 
 	// Test a file that should have all fields populated
 	filePath := filepath.Join(testdataDir, "corrupted-trailing-bytes.warc.gz.open")
@@ -183,7 +193,7 @@ func TestMendResultValidation(t *testing.T) {
 
 // TestAnalyzeWARCFileForceMode tests analyzeWARCFile with force=true on good closed WARC files
 func TestAnalyzeWARCFileForceMode(t *testing.T) {
-	testdataDir := "../../testdata/warcs"
+	testdataDir := getTestdataDir()
 
 	tests := []struct {
 		name            string
@@ -255,7 +265,7 @@ func TestAnalyzeWARCFileForceMode(t *testing.T) {
 
 // TestSkipNonOpenFiles tests that non-.open files are correctly skipped
 func TestSkipNonOpenFiles(t *testing.T) {
-	testdataDir := "../../testdata/warcs"
+	testdataDir := getTestdataDir()
 	filePath := filepath.Join(testdataDir, "skip-non-open.warc.gz")
 
 	// Check if test file exists
@@ -305,7 +315,7 @@ var mendExpectedResults = map[string]expectedResult{
 		recordCount:   1, // Actual count from mend operation
 		truncateAt:    0, // No truncation needed
 		description:   "good synthetic file with .open suffix",
-		shouldBeValid: false, // File has WARC header corruption that mend can't fix
+		shouldBeValid: true, // After removing the .open suffix the WARC remains valid
 	},
 	"empty.warc.gz.open": {
 		outputFile:    "empty.warc.gz",
@@ -321,15 +331,15 @@ var mendExpectedResults = map[string]expectedResult{
 		recordCount:   1,    // Actual count from mend operation
 		truncateAt:    2362, // Truncates trailing garbage
 		description:   "synthetic file with trailing garbage bytes",
-		shouldBeValid: false, // File has WARC header corruption that mend can't fix
+		shouldBeValid: true, // Truncating the trailing garbage yields a valid WARC record
 	},
 	"corrupted-mid-record.warc.gz.open": {
 		outputFile:    "corrupted-mid-record.warc.gz",
 		sha256:        "7c7f896ce58404c841a652500efefbba5f4d92ccc6f9161b0b60aa816f542a7c",
 		recordCount:   1, // Actual count from mend operation
 		truncateAt:    1219,
 		description:   "synthetic file corrupted mid-record",
-		shouldBeValid: false, // File has WARC header corruption that mend can't fix
+		shouldBeValid: true, // Truncating back to the last valid position restores a valid record
 	},
 }
 
@@ -359,14 +369,7 @@ func createMockCobraCommand() *cobra.Command {
 // TestMendFunctionDirect verifies that the mend function produces
 // expected results on synthetic test data by comparing against pre-computed checksums
 func TestMendFunctionDirect(t *testing.T) {
-	// Get current directory and construct paths relative to workspace root
-	cwd, err := os.Getwd()
-	if err != nil {
-		t.Fatalf("failed to get current directory: %v", err)
-	}
-	// From cmd/mend, go up to workspace root
-	workspaceRoot := filepath.Join(cwd, "../..")
-	testdataDir := filepath.Join(workspaceRoot, "testdata/warcs")
+	testdataDir := getTestdataDir()
 	outputDir := filepath.Join(testdataDir, "mend_test_output")
 
 	// Ensure output directory exists
@@ -505,7 +508,7 @@ func copyFile(src, dst string) error {
 
 // TestIsGzipFile tests the gzip file detection function
 func TestIsGzipFile(t *testing.T) {
-	testdataDir := "../../testdata/warcs"
+	testdataDir := getTestdataDir()
 
 	tests := []struct {
 		name     string
@@ -643,7 +646,7 @@ func TestConfirmAction(t *testing.T) {
 
 // TestMendDryRun tests the mend function in dry-run mode
 func TestMendDryRun(t *testing.T) {
-	testdataDir := "../../testdata/warcs"
+	testdataDir := getTestdataDir()
 	tempDir, err := os.MkdirTemp("", "mend_dry_run_test_*")
 	if err != nil {
 		t.Fatalf("failed to create temp dir: %v", err)