Skip to content

Commit 6d48952

Browse files
authored
Update to latest version of gocrawlhq (#156)
* Initial HQv3 update support
* Update gocrawlhq
* fix: change error with new URLs to debug for now (per corentin)
* update crawlhq
* update gocrawlhq
* comment out useless error - per corentin: "it's fine"
* update gocrawlhq
1 parent fc0b683 commit 6d48952

File tree

3 files changed

+35
-21
lines changed

3 files changed

+35
-21
lines changed

go.mod

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ require (
1313
github.com/gosuri/uilive v0.0.4
1414
github.com/gosuri/uitable v0.0.4
1515
github.com/grafov/m3u8 v0.12.0
16-
github.com/internetarchive/gocrawlhq v1.2.14
16+
github.com/internetarchive/gocrawlhq v1.2.19
1717
github.com/paulbellamy/ratecounter v0.2.0
1818
github.com/philippgille/gokv/leveldb v0.7.0
1919
github.com/prometheus/client_golang v1.20.4
@@ -87,7 +87,7 @@ require (
8787
golang.org/x/exp v0.0.0-20240909161429-701f63a606c0 // indirect
8888
golang.org/x/mod v0.21.0 // indirect
8989
golang.org/x/sync v0.8.0 // indirect
90-
golang.org/x/sys v0.25.0 // indirect
90+
golang.org/x/sys v0.26.0 // indirect
9191
golang.org/x/text v0.18.0 // indirect
9292
golang.org/x/tools v0.25.0 // indirect
9393
gopkg.in/ini.v1 v1.67.0 // indirect

go.sum

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,16 @@ github.com/internetarchive/gocrawlhq v1.2.13 h1:ALfUrWR7nRez5gWhHRJ7ZklIpGMjERGM
8484
github.com/internetarchive/gocrawlhq v1.2.13/go.mod h1:JQIKgebFmpbxmEalNRjID3RwCxHkslt3PHAnum82KtM=
8585
github.com/internetarchive/gocrawlhq v1.2.14 h1:g3MPMonpA6mTkCpjBvW3paeBHiH+gGgwSvkyX/lxu7s=
8686
github.com/internetarchive/gocrawlhq v1.2.14/go.mod h1:IOHVfWsptADzh+r2J+UnSm22EB9r8TiVVeAuP9WRFoc=
87+
github.com/internetarchive/gocrawlhq v1.2.15 h1:Llv6tvxxRUxoC9G4GsjkpbfKX0anbQUU+pwFiROlxzg=
88+
github.com/internetarchive/gocrawlhq v1.2.15/go.mod h1:Rjkyx2ttWDG4vzXOrl7ilzdtbODJ3XSe2PkO77bxSTs=
89+
github.com/internetarchive/gocrawlhq v1.2.16 h1:D9JJdLL8uqpHUDU3SxxcXUjQETbxnk08e9xo929xrlE=
90+
github.com/internetarchive/gocrawlhq v1.2.16/go.mod h1:Rjkyx2ttWDG4vzXOrl7ilzdtbODJ3XSe2PkO77bxSTs=
91+
github.com/internetarchive/gocrawlhq v1.2.17 h1:nSjFHpDp5C9Q8SrDPibC4Iiih6kpw18+2GnifJiVpO0=
92+
github.com/internetarchive/gocrawlhq v1.2.17/go.mod h1:Rjkyx2ttWDG4vzXOrl7ilzdtbODJ3XSe2PkO77bxSTs=
93+
github.com/internetarchive/gocrawlhq v1.2.18 h1:PPe7UqJ2NNOljn70SmUhoKdgPreeqRUk9XVrYShCn4w=
94+
github.com/internetarchive/gocrawlhq v1.2.18/go.mod h1:Rjkyx2ttWDG4vzXOrl7ilzdtbODJ3XSe2PkO77bxSTs=
95+
github.com/internetarchive/gocrawlhq v1.2.19 h1:bvDliaeWjt97x64bOf+rKXStQX7VE+ZON/I1FS3sQ6A=
96+
github.com/internetarchive/gocrawlhq v1.2.19/go.mod h1:gHrdMewIi5OBWE/xEZGqSrNHyTXPbt+h+XUWpp9fZek=
8797
github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg=
8898
github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo=
8999
github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U=
@@ -239,6 +249,8 @@ golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
239249
golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
240250
golang.org/x/sys v0.25.0 h1:r+8e+loiHxRqhXVl6ML1nO3l1+oFoWbnlu2Ehimmi34=
241251
golang.org/x/sys v0.25.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
252+
golang.org/x/sys v0.26.0 h1:KHjCJyddX0LoSTb3J+vWpupP9p0oznkqVk/IfjymZbo=
253+
golang.org/x/sys v0.26.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
242254
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
243255
golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8=
244256
golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k=

internal/pkg/crawl/hq.go

Lines changed: 21 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ func (c *Crawl) HQProducer() {
7070
// is already closed, so no other goroutine can write to the slice
7171
if len(discoveredArray) > 0 {
7272
for {
73-
_, err := c.HQClient.Discovered(discoveredArray, "seed", false, false)
73+
err := c.HQClient.Add(discoveredArray, false)
7474
if err != nil {
7575
c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{})).Error("error sending payload to crawl HQ, waiting 1s then retrying..")
7676
time.Sleep(time.Second)
@@ -85,7 +85,7 @@ func (c *Crawl) HQProducer() {
8585
mutex.Lock()
8686
if (len(discoveredArray) >= int(math.Ceil(float64(c.Workers.Count)/2)) || time.Since(HQLastSent) >= time.Second*10) && len(discoveredArray) > 0 {
8787
for {
88-
_, err := c.HQClient.Discovered(discoveredArray, "seed", false, false)
88+
err := c.HQClient.Add(discoveredArray, false)
8989
if err != nil {
9090
c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{})).Error("error sending payload to crawl HQ, waiting 1s then retrying..")
9191
time.Sleep(time.Second)
@@ -123,7 +123,7 @@ func (c *Crawl) HQProducer() {
123123
// gob's encode/decode doesn't properly support booleans
124124
if discoveredItem.BypassSeencheck {
125125
for {
126-
_, err := c.HQClient.Discovered([]gocrawlhq.URL{discoveredURL}, "seed", true, false)
126+
err := c.HQClient.Add([]gocrawlhq.URL{discoveredURL}, true)
127127
if err != nil {
128128
c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{
129129
"bypassSeencheck": discoveredItem.BypassSeencheck,
@@ -177,20 +177,20 @@ func (c *Crawl) HQConsumer() {
177177

178178
// get batch from crawl HQ
179179
c.HQConsumerState = "waitingOnFeed"
180-
batch, err := c.HQClient.Feed(HQBatchSize, c.HQStrategy)
180+
URLs, err := c.HQClient.Feed(HQBatchSize, c.HQStrategy)
181181
if err != nil {
182-
c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{
183-
"batchSize": HQBatchSize,
184-
"err": err,
185-
})).Error("error getting new URLs from crawl HQ")
182+
// c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{
183+
// "batchSize": HQBatchSize,
184+
// "err": err,
185+
// })).Debug("error getting new URLs from crawl HQ")
186186
continue
187187
}
188188
c.HQConsumerState = "feedCompleted"
189189

190190
// send all URLs received in the batch to the queue
191-
var items = make([]*queue.Item, 0, len(batch.URLs))
192-
if len(batch.URLs) > 0 {
193-
for _, URL := range batch.URLs {
191+
var items = make([]*queue.Item, 0, len(URLs))
192+
if len(URLs) > 0 {
193+
for _, URL := range URLs {
194194
c.HQConsumerState = "urlParse"
195195
newURL, err := url.Parse(URL.Value)
196196
if err != nil {
@@ -246,7 +246,7 @@ func (c *Crawl) HQFinisher() {
246246

247247
if len(finishedArray) == int(math.Ceil(float64(c.Workers.Count)/2)) {
248248
for {
249-
_, err := c.HQClient.Finished(finishedArray, locallyCrawledTotal)
249+
err := c.HQClient.Delete(finishedArray, locallyCrawledTotal)
250250
if err != nil {
251251
c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{
252252
"finishedArray": finishedArray,
@@ -265,7 +265,7 @@ func (c *Crawl) HQFinisher() {
265265
// send remaining finished URLs
266266
if len(finishedArray) > 0 {
267267
for {
268-
_, err := c.HQClient.Finished(finishedArray, locallyCrawledTotal)
268+
err := c.HQClient.Delete(finishedArray, locallyCrawledTotal)
269269
if err != nil {
270270
c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{
271271
"finishedArray": finishedArray,
@@ -286,10 +286,11 @@ func (c *Crawl) HQSeencheckURLs(URLs []*url.URL) (seencheckedBatch []*url.URL, e
286286
for _, URL := range URLs {
287287
discoveredURLs = append(discoveredURLs, gocrawlhq.URL{
288288
Value: utils.URLToString(URL),
289+
Type: "asset",
289290
})
290291
}
291292

292-
discoveredResponse, err := c.HQClient.Discovered(discoveredURLs, "asset", false, true)
293+
outputURLs, err := c.HQClient.Seencheck(discoveredURLs)
293294
if err != nil {
294295
c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{
295296
"batchLen": len(URLs),
@@ -298,8 +299,8 @@ func (c *Crawl) HQSeencheckURLs(URLs []*url.URL) (seencheckedBatch []*url.URL, e
298299
return seencheckedBatch, err
299300
}
300301

301-
if discoveredResponse.URLs != nil {
302-
for _, URL := range discoveredResponse.URLs {
302+
if outputURLs != nil {
303+
for _, URL := range outputURLs {
303304
// the returned payload only contain new URLs to be crawled by Zeno
304305
newURL, err := url.Parse(URL.Value)
305306
if err != nil {
@@ -324,16 +325,17 @@ func (c *Crawl) HQSeencheckURLs(URLs []*url.URL) (seencheckedBatch []*url.URL, e
324325
func (c *Crawl) HQSeencheckURL(URL *url.URL) (bool, error) {
325326
discoveredURL := gocrawlhq.URL{
326327
Value: utils.URLToString(URL),
328+
Type: "asset",
327329
}
328330

329-
discoveredResponse, err := c.HQClient.Discovered([]gocrawlhq.URL{discoveredURL}, "asset", false, true)
331+
outputURLs, err := c.HQClient.Seencheck([]gocrawlhq.URL{discoveredURL})
330332
if err != nil {
331333
c.Log.Error("error sending seencheck payload to crawl HQ", "err", err, "url", utils.URLToString(URL))
332334
return true, err // return true, don't discard the URL if there's an error
333335
}
334336

335-
if discoveredResponse.URLs != nil {
336-
for _, URL := range discoveredResponse.URLs {
337+
if outputURLs != nil {
338+
for _, URL := range outputURLs {
337339
// the returned payload only contain new URLs to be crawled by Zeno
338340
if URL.Value == discoveredURL.Value {
339341
return true, nil

0 commit comments

Comments
 (0)