66 "sync"
77 "sync/atomic"
88 "time"
9+
10+ "github.com/maypok86/otter"
911)
1012
1113type Error struct {
@@ -44,7 +46,7 @@ type HTTPClientSettings struct {
4446type CustomHTTPClient struct {
4547 interfacesWatcherStop chan bool
4648 WaitGroup * WaitGroupWithCount
47- dedupeHashTable * sync. Map
49+ dedupeHashTable * otter. Cache [ string , revisitRecord ]
4850 ErrChan chan * Error
4951 WARCWriter chan * RecordBatch
5052 interfacesWatcherStarted chan bool
@@ -60,6 +62,7 @@ type CustomHTTPClient struct {
6062 FullOnDisk bool
6163 DigestAlgorithm DigestAlgorithm
6264 closeDNSCache func ()
65+ closeDedupeCache func ()
6366 // MaxRAMUsageFraction is the fraction of system RAM above which we'll force spooling to disk. For example, 0.5 = 50%.
6467 // If set to <= 0, the default value is DefaultMaxRAMUsageFraction.
6568 MaxRAMUsageFraction float64
@@ -99,6 +102,7 @@ func (c *CustomHTTPClient) Close() error {
99102 }
100103
101104 c .closeDNSCache ()
105+ c .closeDedupeCache ()
102106
103107 return nil
104108}
@@ -132,7 +136,23 @@ func NewWARCWritingHTTPClient(HTTPClientSettings HTTPClientSettings) (httpClient
132136
133137 // Toggle deduplication options and create the cache for deduplication records.
134138 httpClient .dedupeOptions = HTTPClientSettings .DedupeOptions
135- httpClient .dedupeHashTable = new (sync.Map )
139+
140+ // Set default dedupe cache size to 1M entries if not specified
141+ dedupeCacheSize := HTTPClientSettings .DedupeOptions .DedupeCacheSize
142+ if dedupeCacheSize == 0 {
143+ dedupeCacheSize = 1_000_000
144+ }
145+
146+ dedupeCache , err := otter.MustBuilder [string , revisitRecord ](dedupeCacheSize ).Build ()
147+ if err != nil {
148+ return nil , err
149+ }
150+ httpClient .dedupeHashTable = & dedupeCache
151+
152+ httpClient .closeDedupeCache = func () {
153+ httpClient .dedupeHashTable .Close ()
154+ time .Sleep (1 * time .Second )
155+ }
136156
137157 // Set default deduplication threshold to 2048 bytes
138158 if httpClient .dedupeOptions .SizeThreshold == 0 {
0 commit comments