Skip to content

Commit 5e4b4b6

Browse files
committed
MVP
1 parent 14bade2 commit 5e4b4b6

File tree

7 files changed

+289
-14
lines changed

7 files changed

+289
-14
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
main.go
2+
gocdx
3+
.vscode/
11.5 KB
Binary file not shown.

e2e/test/data/bad.warc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
totofooooo

generate.go

Lines changed: 201 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,206 @@
11
package gocdx
22

3-
import "io"
3+
import (
4+
"bufio"
5+
"crypto/sha1"
6+
"encoding/base32"
7+
"io"
8+
"strconv"
9+
"strings"
10+
"time"
11+
12+
"github.com/internetarchive/gocdx/pkg/surt"
13+
warc "github.com/internetarchive/gowarc"
14+
)
15+
16+
type overloadedWARCRecord struct {
17+
*warc.Record
18+
19+
compByteOffset int64
20+
compByteLength int64
21+
httpMessage string
22+
httpHeaders map[string]string
23+
warcFileName string
24+
}
425

526
// Generate reads a WARC file from the provided reader and returns a slice of Record generated from the given WARC records.
6-
func Generate(warc io.Reader, header string) ([]Record, error) {
7-
// This function is a placeholder for the actual implementation of generating
8-
// a CDX file from a WARC reader. The implementation would typically involve
9-
// reading the WARC records, extracting necessary fields, and formatting them
10-
// according to the CDX specification.
11-
12-
// For now, we will return an empty slice and nil error.
13-
return []Record{}, nil
27+
func Generate(warcFile io.ReadCloser, header string) ([]*Record, error) {
28+
warcReader, err := warc.NewReader(warcFile)
29+
if err != nil {
30+
return nil, err
31+
}
32+
33+
var currentPosition int64
34+
var warcFileName string
35+
var i int
36+
37+
var warcRecords []*overloadedWARCRecord
38+
for {
39+
warcRecord, size, err := warcReader.ReadRecord()
40+
if err != nil {
41+
return nil, err
42+
}
43+
if size == 0 {
44+
// EOF reached, no more records
45+
break
46+
}
47+
48+
if i == 0 && warcRecord.Header.Get("WARC-Filename") != "" {
49+
warcFileName = warcRecord.Header.Get("WARC-Filename")
50+
} else if i == 0 {
51+
// If the first record does not have a WARC-Filename, this is an error that we want to soft-fail
52+
warcFileName = "unknown.warc"
53+
}
54+
55+
var httpMessage string
56+
var httpHeaders map[string]string
57+
if warcRecord.Header.Get("WARC-Type") == "response" {
58+
httpMessage, httpHeaders = parseHTTPHeadersFromWARCRecord(warcRecord)
59+
}
60+
parsedWARCRecord := &overloadedWARCRecord{
61+
Record: warcRecord,
62+
compByteOffset: currentPosition,
63+
compByteLength: size,
64+
httpMessage: httpMessage,
65+
httpHeaders: httpHeaders,
66+
warcFileName: warcFileName,
67+
}
68+
69+
// spew.Dump(parsedWARCRecord)
70+
71+
warcRecords = append(warcRecords, parsedWARCRecord)
72+
parsedWARCRecord.Record.Content.Close() // Close the content to avoid memory leaks as we are not using it here
73+
74+
currentPosition += size
75+
}
76+
77+
// spew.Dump(warcRecords)
78+
79+
headerFields := strings.FieldsSeq(header)
80+
81+
records := make([]*Record, 0)
82+
for _, warcRecord := range warcRecords {
83+
if warcRecord.Record.Header.Get("WARC-Type") != "response" {
84+
// Only process records with WARC-Type "response"
85+
continue
86+
}
87+
88+
record := &Record{}
89+
for field := range headerFields {
90+
switch field {
91+
case "N":
92+
record.MassagedURL = surt.Massage(warcRecord.Record.Header.Get("WARC-Target-URI"))
93+
case "b":
94+
parsedTime, err := time.Parse(time.RFC3339, warcRecord.Record.Header.Get("WARC-Date"))
95+
if err != nil {
96+
parsedTime = time.Time{}
97+
}
98+
record.Timestamp = parsedTime
99+
case "a":
100+
record.OriginalURL = warcRecord.Record.Header.Get("WARC-Target-URI")
101+
case "m":
102+
record.MIMEType = strings.TrimSuffix(warcRecord.Record.Header.Get("Content-Type"), "; msgtype=response")
103+
case "s":
104+
record.StatusCode = -1
105+
106+
splittedHTTPMessage := strings.Split(warcRecord.httpMessage, " ")
107+
if len(splittedHTTPMessage) >= 2 {
108+
parsedStatusCode, err := strconv.Atoi(splittedHTTPMessage[1])
109+
if err == nil {
110+
record.StatusCode = parsedStatusCode
111+
}
112+
}
113+
case "k":
114+
trimmed := strings.TrimPrefix(warcRecord.Record.Header.Get("WARC-Block-Digest"), "sha1:")
115+
if trimmed != warcRecord.Record.Header.Get("WARC-Block-Digest") {
116+
record.NewStyleChecksum = trimmed
117+
} else {
118+
hasher := sha1.New()
119+
warcRecord.Record.Content.Seek(0, 0)
120+
io.Copy(hasher, warcRecord.Record.Content)
121+
record.NewStyleChecksum = base32.StdEncoding.EncodeToString(hasher.Sum(nil))
122+
}
123+
case "r":
124+
// TODO : clarify with whoever what to do with this field
125+
record.Redirect = "-"
126+
case "M":
127+
// TODO : let's ignore this field for now
128+
record.MetaTags = "-"
129+
case "S":
130+
record.CompressedRecordSize = warcRecord.compByteLength
131+
case "V":
132+
record.CompressedArcOffset = warcRecord.compByteOffset
133+
case "g":
134+
record.Filename = warcRecord.warcFileName
135+
}
136+
}
137+
records = append(records, record)
138+
}
139+
140+
return records, nil
141+
}
142+
143+
// FormatCDX formats a Record into a CDX string based on the header format.
144+
func (r *Record) FormatCDX(header string) (string, error) {
145+
var result strings.Builder
146+
headerFields := strings.FieldsSeq(header)
147+
148+
for field := range headerFields {
149+
switch field {
150+
case "N":
151+
result.WriteString(r.MassagedURL)
152+
case "b":
153+
result.WriteString(r.Timestamp.Format("20060102150405"))
154+
case "a":
155+
result.WriteString(r.OriginalURL)
156+
case "m":
157+
result.WriteString(r.MIMEType)
158+
case "s":
159+
result.WriteString(strconv.Itoa(r.StatusCode))
160+
case "k":
161+
result.WriteString(r.NewStyleChecksum)
162+
case "r":
163+
result.WriteString(r.Redirect)
164+
case "M":
165+
result.WriteString(r.MetaTags)
166+
case "S":
167+
result.WriteString(strconv.FormatInt(r.CompressedRecordSize, 10))
168+
case "V":
169+
result.WriteString(strconv.FormatInt(r.CompressedArcOffset, 10))
170+
case "g":
171+
result.WriteString(r.Filename)
172+
}
173+
result.WriteString(" ")
174+
}
175+
176+
return strings.TrimSpace(result.String()), nil
177+
}
178+
179+
func parseHTTPHeadersFromWARCRecord(warcRecord *warc.Record) (message string, headers map[string]string) {
180+
headers = make(map[string]string)
181+
182+
var i int
183+
scanner := bufio.NewScanner(warcRecord.Content)
184+
for scanner.Scan() {
185+
line := scanner.Text()
186+
if line == "" {
187+
break
188+
}
189+
190+
// handle HTTP message
191+
if i == 0 {
192+
message = strings.Clone(line)
193+
i++
194+
continue
195+
}
196+
197+
parts := strings.SplitN(line, ": ", 2)
198+
if len(parts) == 2 {
199+
headers[strings.ToLower(parts[0])] = strings.ToLower(parts[1])
200+
}
201+
202+
i++
203+
}
204+
205+
return
14206
}

go.mod

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,30 @@
11
module github.com/internetarchive/gocdx
22

3-
go 1.23.1
3+
go 1.24.2
4+
5+
replace github.com/internetarchive/gowarc => ../gowarc
6+
7+
require (
8+
github.com/davecgh/go-spew v1.1.1
9+
github.com/internetarchive/gowarc v0.8.85
10+
)
11+
12+
require (
13+
github.com/ImVexed/fasturl v0.0.0-20230304231329-4e41488060f3 // indirect
14+
github.com/andybalholm/brotli v1.1.1 // indirect
15+
github.com/cloudflare/circl v1.6.1 // indirect
16+
github.com/dolthub/maphash v0.1.0 // indirect
17+
github.com/gammazero/deque v1.0.0 // indirect
18+
github.com/google/uuid v1.6.0 // indirect
19+
github.com/klauspost/compress v1.18.0 // indirect
20+
github.com/maypok86/otter v1.2.4 // indirect
21+
github.com/miekg/dns v1.1.65 // indirect
22+
github.com/refraction-networking/utls v1.6.7 // indirect
23+
github.com/ulikunitz/xz v0.5.12 // indirect
24+
golang.org/x/crypto v0.37.0 // indirect
25+
golang.org/x/mod v0.24.0 // indirect
26+
golang.org/x/net v0.39.0 // indirect
27+
golang.org/x/sync v0.13.0 // indirect
28+
golang.org/x/sys v0.32.0 // indirect
29+
golang.org/x/tools v0.32.0 // indirect
30+
)

go.sum

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
github.com/ImVexed/fasturl v0.0.0-20230304231329-4e41488060f3 h1:ClzzXMDDuUbWfNNZqGeYq4PnYOlwlOVIvSyNaIy0ykg=
2+
github.com/ImVexed/fasturl v0.0.0-20230304231329-4e41488060f3/go.mod h1:we0YA5CsBbH5+/NUzC/AlMmxaDtWlXeNsqrwXjTzmzA=
3+
github.com/andybalholm/brotli v1.1.1 h1:PR2pgnyFznKEugtsUo0xLdDop5SKXd5Qf5ysW+7XdTA=
4+
github.com/andybalholm/brotli v1.1.1/go.mod h1:05ib4cKhjx3OQYUY22hTVd34Bc8upXjOLL2rKwwZBoA=
5+
github.com/cloudflare/circl v1.6.1 h1:zqIqSPIndyBh1bjLVVDHMPpVKqp8Su/V+6MeDzzQBQ0=
6+
github.com/cloudflare/circl v1.6.1/go.mod h1:uddAzsPgqdMAYatqJ0lsjX1oECcQLIlRpzZh3pJrofs=
7+
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
8+
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
9+
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
10+
github.com/dolthub/maphash v0.1.0 h1:bsQ7JsF4FkkWyrP3oCnFJgrCUAFbFf3kOl4L/QxPDyQ=
11+
github.com/dolthub/maphash v0.1.0/go.mod h1:gkg4Ch4CdCDu5h6PMriVLawB7koZ+5ijb9puGMV50a4=
12+
github.com/dvyukov/go-fuzz v0.0.0-20200318091601-be3528f3a813/go.mod h1:11Gm+ccJnvAhCNLlf5+cS9KjtbaD5I5zaZpFMsTHWTw=
13+
github.com/gammazero/deque v1.0.0 h1:LTmimT8H7bXkkCy6gZX7zNLtkbz4NdS2z8LZuor3j34=
14+
github.com/gammazero/deque v1.0.0/go.mod h1:iflpYvtGfM3U8S8j+sZEKIak3SAKYpA5/SQewgfXDKo=
15+
github.com/google/go-cmp v0.6.0 h1:ofyhxvXcZhMsU5ulbFiLKl/XBFqE1GSq7atu8tAmTRI=
16+
github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
17+
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
18+
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
19+
github.com/klauspost/compress v1.18.0 h1:c/Cqfb0r+Yi+JtIEq73FWXVkRonBlf0CRNYc8Zttxdo=
20+
github.com/klauspost/compress v1.18.0/go.mod h1:2Pp+KzxcywXVXMr50+X0Q/Lsb43OQHYWRCY2AiWywWQ=
21+
github.com/maypok86/otter v1.2.4 h1:HhW1Pq6VdJkmWwcZZq19BlEQkHtI8xgsQzBVXJU0nfc=
22+
github.com/maypok86/otter v1.2.4/go.mod h1:mKLfoI7v1HOmQMwFgX4QkRk23mX6ge3RDvjdHOWG4R4=
23+
github.com/miekg/dns v1.1.65 h1:0+tIPHzUW0GCge7IiK3guGP57VAw7hoPDfApjkMD1Fc=
24+
github.com/miekg/dns v1.1.65/go.mod h1:Dzw9769uoKVaLuODMDZz9M6ynFU6Em65csPuoi8G0ck=
25+
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
26+
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
27+
github.com/refraction-networking/utls v1.6.7 h1:zVJ7sP1dJx/WtVuITug3qYUq034cDq9B2MR1K67ULZM=
28+
github.com/refraction-networking/utls v1.6.7/go.mod h1:BC3O4vQzye5hqpmDTWUqi4P5DDhzJfkV1tdqtawQIH0=
29+
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
30+
github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
31+
github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
32+
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
33+
github.com/things-go/go-socks5 v0.0.6 h1:YjylIYZiND41szH4NzsVbx8aVDsS/Y8ps3QYPwQvqnI=
34+
github.com/things-go/go-socks5 v0.0.6/go.mod h1:RF6tRutwNWzISbPfiDEChH/o1aDfRv+cXDYn2a2qkK4=
35+
github.com/ulikunitz/xz v0.5.12 h1:37Nm15o69RwBkXM0J6A5OlE67RZTfzUxTj8fB3dfcsc=
36+
github.com/ulikunitz/xz v0.5.12/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14=
37+
github.com/xyproto/randomstring v1.0.5 h1:YtlWPoRdgMu3NZtP45drfy1GKoojuR7hmRcnhZqKjWU=
38+
github.com/xyproto/randomstring v1.0.5/go.mod h1:rgmS5DeNXLivK7YprL0pY+lTuhNQW3iGxZ18UQApw/E=
39+
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
40+
go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
41+
golang.org/x/crypto v0.37.0 h1:kJNSjF/Xp7kU0iB2Z+9viTPMW4EqqsrywMXLJOOsXSE=
42+
golang.org/x/crypto v0.37.0/go.mod h1:vg+k43peMZ0pUMhYmVAWysMK35e6ioLh3wB8ZCAfbVc=
43+
golang.org/x/mod v0.24.0 h1:ZfthKaKaT4NrhGVZHO1/WDTwGES4De8KtWO0SIbNJMU=
44+
golang.org/x/mod v0.24.0/go.mod h1:IXM97Txy2VM4PJ3gI61r1YEk/gAj6zAHN3AdZt6S9Ww=
45+
golang.org/x/net v0.39.0 h1:ZCu7HMWDxpXpaiKdhzIfaltL9Lp31x/3fCP11bc6/fY=
46+
golang.org/x/net v0.39.0/go.mod h1:X7NRbYVEA+ewNkCNyJ513WmMdQ3BineSwVtN2zD/d+E=
47+
golang.org/x/sync v0.13.0 h1:AauUjRAJ9OSnvULf/ARrrVywoJDy0YS2AwQ98I37610=
48+
golang.org/x/sync v0.13.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA=
49+
golang.org/x/sys v0.32.0 h1:s77OFDvIQeibCmezSnk/q6iAfkdiQaJi4VzroCFrN20=
50+
golang.org/x/sys v0.32.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k=
51+
golang.org/x/tools v0.32.0 h1:Q7N1vhpkQv7ybVzLFtTjvQya2ewbwNDZzUgfXGqtMWU=
52+
golang.org/x/tools v0.32.0/go.mod h1:ZxrU41P/wAbZD8EDa6dDCa6XfpkhJ7HFMjHJXfBDu8s=
53+
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
54+
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
55+
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
56+
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

pkg/surt/massage.go

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,18 +30,14 @@ func Massage(url string, opts ...MassageOpts) (massagedURL string) {
3030
}
3131
}
3232

33-
// Keep certain non-hierarchical / special schemes exactly as-is.
3433
if isOpaquePassThrough(pu.Protocol) {
3534
return url
3635
}
3736

38-
// Opaque schemes (mailto:), or custom like dns:, warcinfo:, filedesc: must be returned verbatim.
3937
if isNonHierarchical(pu.Protocol) {
4038
return url
4139
}
4240

43-
// WHOIS behaves like a hierarchical scheme for these tests.
44-
// For http/https/ftp/whois, produce SURT.
4541
host := strings.ToLower(pu.Host)
4642
if massageHost {
4743
host, _ = strings.CutPrefix(host, "www.")

0 commit comments

Comments
 (0)