Skip to content

Commit 8d29277

Browse files
authored
feat: update gowarc (#554)
* feat: update gowarc
* feat: allow configuration of dedupe cache size
* fix: oopsies
1 parent aa1fbdb commit 8d29277

File tree

5 files changed

+15
-5
lines changed

5 files changed

+15
-5
lines changed

cmd/get.go

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -125,6 +125,7 @@ func addWARCFlags(getCmd *cobra.Command) {
125125
getCmd.PersistentFlags().Bool("cert-validation", false, "Enables certificate validation on HTTPS requests.")
126126
getCmd.PersistentFlags().Bool("disable-assets-capture", false, "Disable assets capture.")
127127
getCmd.PersistentFlags().Int("warc-dedupe-size", 1024, "Minimum size to deduplicate WARC records with revisit records.")
128+
getCmd.PersistentFlags().Int("warc-dedupe-cache-size", 1000000, "Maximum number of records to store in the local dedupe cache.")
128129
getCmd.PersistentFlags().String("warc-cdx-cookie", "", "Pass custom cookie during CDX requests. Example: 'cdx_auth_token=test_value'")
129130
getCmd.PersistentFlags().Int("warc-size", 1024, "Size of the WARC files in MB.")
130131
getCmd.PersistentFlags().IntSlice("warc-discard-status", []int{429}, "HTTP status codes to discard from WARC files. By default, 429 is always discarded.")

go.mod

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,7 @@ require (
2222
github.com/grafov/m3u8 v0.12.1
2323
github.com/hashicorp/consul/api v1.33.2
2424
github.com/internetarchive/gocrawlhq v1.2.34
25-
github.com/internetarchive/gowarc v0.8.96
25+
github.com/internetarchive/gowarc v0.8.97
2626
github.com/ncruces/go-sqlite3 v0.30.5
2727
github.com/pdfcpu/pdfcpu v0.11.1
2828
github.com/philippgille/gokv/leveldb v0.7.0
@@ -72,14 +72,14 @@ require (
7272
github.com/hhrutter/pkcs7 v0.2.0 // indirect
7373
github.com/hhrutter/tiff v1.0.2 // indirect
7474
github.com/inconshreveable/mousetrap v1.1.0 // indirect
75-
github.com/klauspost/compress v1.18.1 // indirect
75+
github.com/klauspost/compress v1.18.3 // indirect
7676
github.com/klauspost/cpuid/v2 v2.0.12 // indirect
7777
github.com/lucasb-eyer/go-colorful v1.3.0 // indirect
7878
github.com/mattn/go-colorable v0.1.13 // indirect
7979
github.com/mattn/go-isatty v0.0.20 // indirect
8080
github.com/mattn/go-runewidth v0.0.19 // indirect
8181
github.com/maypok86/otter v1.2.4 // indirect
82-
github.com/miekg/dns v1.1.68 // indirect
82+
github.com/miekg/dns v1.1.72 // indirect
8383
github.com/mitchellh/go-homedir v1.1.0 // indirect
8484
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
8585
github.com/ncruces/julianday v1.0.0 // indirect
@@ -91,7 +91,7 @@ require (
9191
github.com/prometheus/client_model v0.6.2 // indirect
9292
github.com/prometheus/common v0.66.1 // indirect
9393
github.com/prometheus/procfs v0.16.1 // indirect
94-
github.com/refraction-networking/utls v1.8.1 // indirect
94+
github.com/refraction-networking/utls v1.8.2 // indirect
9595
github.com/rivo/uniseg v0.4.7 // indirect
9696
github.com/sagikazarmark/locafero v0.11.0 // indirect
9797
github.com/samber/lo v1.52.0 // indirect

go.sum

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -169,11 +169,15 @@ github.com/internetarchive/gocrawlhq v1.2.34 h1:Go1OWovBT72x+56OzekaqOD1oLx+sCpg
169169
github.com/internetarchive/gocrawlhq v1.2.34/go.mod h1:MmfqM9yeThJAbkKY1x5ntfGAGMEjc8mt/trnK4crW9M=
170170
github.com/internetarchive/gowarc v0.8.96 h1:MMw92JjOMscMByIGogvkfHrQTgTEGVvcfoPEw1/b84k=
171171
github.com/internetarchive/gowarc v0.8.96/go.mod h1:dB1LkgWMHl014TrTiEoiC/hDtT2yr4pHkPHemfsLp+I=
172+
github.com/internetarchive/gowarc v0.8.97 h1:ze95I7JVnq6sY2QvbLyicDIqVZn6Rhhun21XjlfCjRk=
173+
github.com/internetarchive/gowarc v0.8.97/go.mod h1:sduU+3bDInoomrTQ6tON1snXb1WCef6OsDJHt4LU3ao=
172174
github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU=
173175
github.com/json-iterator/go v1.1.9/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4=
174176
github.com/julienschmidt/httprouter v1.2.0/go.mod h1:SYymIcj16QtmaHHD7aYtjjsJG7VTCxuUUipMqKk8s4w=
175177
github.com/klauspost/compress v1.18.1 h1:bcSGx7UbpBqMChDtsF28Lw6v/G94LPrrbMbdC3JH2co=
176178
github.com/klauspost/compress v1.18.1/go.mod h1:ZQFFVG+MdnR0P+l6wpXgIL4NTtwiKIdBnrBd8Nrxr+0=
179+
github.com/klauspost/compress v1.18.3 h1:9PJRvfbmTabkOX8moIpXPbMMbYN60bWImDDU7L+/6zw=
180+
github.com/klauspost/compress v1.18.3/go.mod h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4=
177181
github.com/klauspost/cpuid/v2 v2.0.12 h1:p9dKCg8i4gmOxtv35DvrYoWqYzQrvEVdjQ762Y0OqZE=
178182
github.com/klauspost/cpuid/v2 v2.0.12/go.mod h1:g2LTdtYhdyuGPqyWyv7qRAmj1WBqxuObKfj5c0PQa7c=
179183
github.com/konsorten/go-windows-terminal-sequences v1.0.1/go.mod h1:T0+1ngSBFLxvqU3pZ+m/2kptfBszLMUkC4ZK/EgS/cQ=
@@ -213,6 +217,8 @@ github.com/miekg/dns v1.1.26/go.mod h1:bPDLeHnStXmXAq1m/Ch/hvfNHr14JKNPMBo3VZKju
213217
github.com/miekg/dns v1.1.41/go.mod h1:p6aan82bvRIyn+zDIv9xYNUpwa73JcSh9BKwknJysuI=
214218
github.com/miekg/dns v1.1.68 h1:jsSRkNozw7G/mnmXULynzMNIsgY2dHC8LO6U6Ij2JEA=
215219
github.com/miekg/dns v1.1.68/go.mod h1:fujopn7TB3Pu3JM69XaawiU0wqjpL9/8xGop5UrTPps=
220+
github.com/miekg/dns v1.1.72 h1:vhmr+TF2A3tuoGNkLDFK9zi36F2LS+hKTRW0Uf8kbzI=
221+
github.com/miekg/dns v1.1.72/go.mod h1:+EuEPhdHOsfk6Wk5TT2CzssZdqkmFhf8r+aVyDEToIs=
216222
github.com/mitchellh/cli v1.1.0/go.mod h1:xcISNoH86gajksDmfB23e/pu+B+GeFRMYmoHXxx3xhI=
217223
github.com/mitchellh/go-homedir v1.1.0 h1:lukF9ziXFxDFPkA1vsr5zpc1XuPDn/wFntq5mG+4E0Y=
218224
github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0=
@@ -285,6 +291,8 @@ github.com/prometheus/procfs v0.16.1 h1:hZ15bTNuirocR6u0JZ6BAHHmwS1p8B4P6MRqxtzM
285291
github.com/prometheus/procfs v0.16.1/go.mod h1:teAbpZRB1iIAJYREa1LsoWUXykVXA1KlTmWl8x/U+Is=
286292
github.com/refraction-networking/utls v1.8.1 h1:yNY1kapmQU8JeM1sSw2H2asfTIwWxIkrMJI0pRUOCAo=
287293
github.com/refraction-networking/utls v1.8.1/go.mod h1:jkSOEkLqn+S/jtpEHPOsVv/4V4EVnelwbMQl4vCWXAM=
294+
github.com/refraction-networking/utls v1.8.2 h1:j4Q1gJj0xngdeH+Ox/qND11aEfhpgoEvV+S9iJ2IdQo=
295+
github.com/refraction-networking/utls v1.8.2/go.mod h1:jkSOEkLqn+S/jtpEHPOsVv/4V4EVnelwbMQl4vCWXAM=
288296
github.com/rivo/tview v0.42.0 h1:b/ftp+RxtDsHSaynXTbJb+/n/BxDEi+W3UfF5jILK6c=
289297
github.com/rivo/tview v0.42.0/go.mod h1:cSfIYfhpSGCjp3r/ECJb+GKS7cGJnqV8vfjQPwoXyfY=
290298
github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=

internal/pkg/archiver/warc.go

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -26,7 +26,7 @@ func startWARCWriter() error {
2626
rotatorSettings.WarcinfoContent.Set("zeno-headless", "true")
2727
}
2828
// Configure WARC dedupe settings
29-
dedupeOptions := warc.DedupeOptions{LocalDedupe: !config.Get().DisableLocalDedupe, SizeThreshold: config.Get().WARCDedupeSize}
29+
dedupeOptions := warc.DedupeOptions{LocalDedupe: !config.Get().DisableLocalDedupe, SizeThreshold: config.Get().WARCDedupeSize, DedupeCacheSize: config.Get().WARCDedupeCacheSize}
3030
if config.Get().CDXDedupeServer != "" {
3131
dedupeOptions.CDXDedupe = true
3232
dedupeOptions.CDXURL = config.Get().CDXDedupeServer

internal/pkg/config/config.go

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -52,6 +52,7 @@ type Config struct {
5252
WARCPoolSize int `mapstructure:"warc-pool-size"`
5353
WARCQueueSize int `mapstructure:"warc-queue-size"`
5454
WARCDedupeSize int `mapstructure:"warc-dedupe-size"`
55+
WARCDedupeCacheSize int `mapstructure:"warc-dedupe-cache-size"`
5556
WARCWriteAsync bool `mapstructure:"async-warc-write"`
5657
WARCDiscardStatus []int `mapstructure:"warc-discard-status"`
5758
WARCDigestAlgorithm string `mapstructure:"warc-digest-algorithm"`

0 commit comments

Comments
 (0)