Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 15 additions & 11 deletions client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,18 @@ func newTestImageServer(t testing.TB, st int) *httptest.Server {
}))
}

// newTestImageTLSServer starts a TLS-enabled httptest server whose handler
// answers every request with the contents of testdata/image.svg, a
// Content-Type of image/svg+xml and the status code st.
//
// If the fixture cannot be read, the handler records a test error on t and
// replies with 500 Internal Server Error instead of st.
//
// The caller is responsible for calling Close on the returned server.
func newTestImageTLSServer(t testing.TB, st int) *httptest.Server {
	return httptest.NewTLSServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
		fileBytes, err := os.ReadFile(path.Join("testdata", "image.svg"))
		if err != nil {
			// Handlers run on the server's goroutines. t.Fatal (FailNow) must only
			// be called from the goroutine running the test, so report the problem
			// with Errorf and fail the request instead of continuing.
			t.Errorf("reading test image: %v", err)
			http.Error(w, err.Error(), http.StatusInternalServerError)
			return
		}

		// Headers have to be set before WriteHeader is called to be sent.
		w.Header().Set("Content-Type", "image/svg+xml")
		w.WriteHeader(st)

		// Best effort: Write legitimately fails for statuses that do not allow a
		// body (e.g. 204, 304) or when the client hangs up early, and neither
		// case should fail the calling test.
		_, _ = w.Write(fileBytes)
	}))
}

func (e *errorReadCloser) Read(p []byte) (int, error) {
if len(e.data) > 0 && e.readBefore > 0 {
// Read up to min(len(p), readBefore, len(data))
Expand Down Expand Up @@ -1387,7 +1399,7 @@ func TestHTTPClientWithSelfSignedCertificate(t *testing.T) {
)

// init test (self-signed) HTTPS endpoint
server := newTestImageServer(t, http.StatusOK)
server := newTestImageTLSServer(t, http.StatusOK)
defer server.Close()

// init the HTTP client responsible for recording HTTP(s) requests / responses
Expand Down Expand Up @@ -1420,6 +1432,7 @@ func TestHTTPClientWithSelfSignedCertificate(t *testing.T) {

for _, path := range files {
testFileSingleHashCheck(t, path, "sha1:UIRWL5DFIPQ4MX3D3GFHM2HCVU3TZ6I3", []string{"26872"}, 1, server.URL+"/")
testFileTLSHeaders(t, path)
os.Remove(path)
}
}
Expand All @@ -1431,16 +1444,7 @@ func TestWARCWritingWithDisallowedCertificate(t *testing.T) {
)

// init test (self-signed) HTTPS endpoint
server := httptest.NewTLSServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
fileBytes, err := os.ReadFile(path.Join("testdata", "image.svg"))
if err != nil {
t.Fatal(err)
}

w.WriteHeader(http.StatusTooManyRequests)
w.Header().Set("Content-Type", "image/svg+xml")
w.Write(fileBytes)
}))
server := newTestImageTLSServer(t, http.StatusOK)
defer server.Close()

// init the HTTP client responsible for recording HTTP(s) requests / responses
Expand Down
31 changes: 31 additions & 0 deletions dialer.go
Original file line number Diff line number Diff line change
Expand Up @@ -495,6 +495,29 @@ func (d *customDialer) writeWARCFromConnection(ctx context.Context, reqPipe, res
slices.Reverse(batch.Records)
}

var selectedCipherSuite string
var selectedProtocol string

if cc, ok := conn.(*tls.UConn); ok {
state := cc.ConnectionState()
// Use tls.CipherSuiteName for efficient lookup
selectedCipherSuite = tls.CipherSuiteName(state.CipherSuite)
// Add the negotiated protocol version
// Values as defined in WARC proposal https://github.com/iipc/warc-specifications/issues/42
switch state.Version {
case tls.VersionSSL30:
selectedProtocol = "ssl/3"
case tls.VersionTLS10:
selectedProtocol = "tls/1.0"
case tls.VersionTLS11:
selectedProtocol = "tls/1.1"
case tls.VersionTLS12:
selectedProtocol = "tls/1.2"
case tls.VersionTLS13:
selectedProtocol = "tls/1.3"
}
Comment on lines +507 to +518
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In addition to writing a WARC-Protocol value for the encryption used, it could be nice to add support for recording which HTTP version was archived, to have feature parity with wget-lua:

- http/0.9
- http/1.0
- http/1.1

Not a request for right now, but noted all the same

}

var warcTargetURI string
select {
case recv, ok := <-targetURIRespCh:
Expand All @@ -521,6 +544,14 @@ func (d *customDialer) writeWARCFromConnection(ctx context.Context, reqPipe, res

r.Header.Set("WARC-Record-ID", "<urn:uuid:"+recordIDs[i]+">")

if selectedCipherSuite != "" {
r.Header.Set("WARC-Cipher-Suite", selectedCipherSuite)
}

if selectedProtocol != "" {
r.Header.Set("WARC-Protocol", selectedProtocol)
}

if i == len(recordIDs)-1 {
r.Header.Set("WARC-Concurrent-To", "<urn:uuid:"+recordIDs[0]+">")
} else {
Expand Down
53 changes: 53 additions & 0 deletions read_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,59 @@ func testFileRevisitVailidity(t *testing.T, path string, originalTime string, or
}
}

// testFileTLSHeaders validates that WARC-Cipher-Suite and WARC-Protocol headers are present
// for request and response records when the connection was made over TLS.
func testFileTLSHeaders(t *testing.T, path string) {
file, err := os.Open(path)
if err != nil {
t.Fatalf("failed to open %q: %v", path, err)
}
defer file.Close()

t.Logf("checking 'WARC-Cipher-Suite' and 'WARC-Protocol' on %q", path)

reader, err := NewReader(file)
if err != nil {
t.Fatalf("warc.NewReader failed for %q: %v", path, err)
}

for {
record, err := reader.ReadRecord()
if err != nil {
if err == io.EOF {
break
}
t.Fatalf("warc.ReadRecord failed: %v", err)
break
}

recordType := record.Header.Get("WARC-Type")
if recordType == "request" || recordType == "response" {
cipherSuite := record.Header.Get("WARC-Cipher-Suite")
protocol := record.Header.Get("WARC-Protocol")

if cipherSuite == "" {
t.Errorf("WARC-Cipher-Suite header missing for %s record", recordType)
} else {
t.Logf("WARC-Cipher-Suite: %s", cipherSuite)
}

if protocol == "" {
t.Errorf("WARC-Protocol header missing for %s record", recordType)
} else if !strings.HasPrefix(protocol, "tls/") {
t.Errorf("WARC-Protocol should start with 'tls/' for HTTPS, got: %s", protocol)
Comment on lines +328 to +329
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the WARC-Protocol value is http/1.1 (which is valid per iipc/warc-specifications#42), then the test would incorrectly fail according to the spec, even if the point of this test is to look for a specific WARC-Protocol header

Given that we define type Header as a string to string map, we are not equipped to handle WARC-Protocol headers because the draft spec we're implementing supports multiple instances of the header, and attempting to write multiple headers right now would repeatedly overwrite the last header due to the map's unique key requirement. I don't think this limitation is the end of the world, but it should probably be noted in our readme? We have the same problem for WARC-Concurrent-To, and I'm not sure what our WARC verify command would report for a WARC with repeated headers

} else {
t.Logf("WARC-Protocol: %s", protocol)
}
}

err = record.Content.Close()
if err != nil {
t.Fatalf("failed to close record content: %v", err)
}
}
}

func testFileEarlyEOF(t *testing.T, path string) {
file, err := os.Open(path)
if err != nil {
Expand Down