Skip to content

Commit fc0b683

Browse files
CorentinB and NGTmeaty authored
Add S3 extractor (#153)
Co-authored-by: Jake L <NGTmeaty@users.noreply.github.com>
1 parent 1fca2f0 commit fc0b683

File tree

4 files changed

+227
-91
lines changed

4 files changed

+227
-91
lines changed

go.mod

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ module github.com/internetarchive/Zeno
33
go 1.22.4
44

55
require (
6-
github.com/internetarchive/gocrawlhq v1.2.14
76
github.com/CorentinB/warc v0.8.53
87
github.com/PuerkitoBio/goquery v1.9.3
98
github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2
@@ -14,6 +13,7 @@ require (
1413
github.com/gosuri/uilive v0.0.4
1514
github.com/gosuri/uitable v0.0.4
1615
github.com/grafov/m3u8 v0.12.0
16+
github.com/internetarchive/gocrawlhq v1.2.14
1717
github.com/paulbellamy/ratecounter v0.2.0
1818
github.com/philippgille/gokv/leveldb v0.7.0
1919
github.com/prometheus/client_golang v1.20.4
@@ -32,6 +32,7 @@ require (
3232
require (
3333
github.com/andybalholm/brotli v1.1.0 // indirect
3434
github.com/andybalholm/cascadia v1.3.2 // indirect
35+
github.com/aws/aws-sdk-go v1.55.5 // indirect
3536
github.com/beorn7/perks v1.0.1 // indirect
3637
github.com/cespare/xxhash/v2 v2.3.0 // indirect
3738
github.com/cloudflare/circl v1.4.0 // indirect
@@ -49,6 +50,7 @@ require (
4950
github.com/google/go-cmp v0.6.0 // indirect
5051
github.com/hashicorp/hcl v1.0.0 // indirect
5152
github.com/inconshreveable/mousetrap v1.1.0 // indirect
53+
github.com/jmespath/go-jmespath v0.4.0 // indirect
5254
github.com/json-iterator/go v1.1.12 // indirect
5355
github.com/klauspost/compress v1.17.10 // indirect
5456
github.com/magiconair/properties v1.8.7 // indirect

go.sum

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPd
1616
github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs=
1717
github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 h1:DklsrG3dyBCFEj5IhUbnKptjxatkF07cF2ak3yi77so=
1818
github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2/go.mod h1:WaHUgvxTVq04UNunO+XhnAqY/wQc+bxr74GqbsZ/Jqw=
19+
github.com/aws/aws-sdk-go v1.55.5 h1:KKUZBfBoyqy5d3swXyiC7Q76ic40rYcbqH7qjh59kzU=
20+
github.com/aws/aws-sdk-go v1.55.5/go.mod h1:eRwEWoyTWFMVYVQzKMNHWP5/RV4xIUGMQfXQHfHkpNU=
1921
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
2022
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
2123
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
@@ -82,6 +84,9 @@ github.com/internetarchive/gocrawlhq v1.2.13 h1:ALfUrWR7nRez5gWhHRJ7ZklIpGMjERGM
8284
github.com/internetarchive/gocrawlhq v1.2.13/go.mod h1:JQIKgebFmpbxmEalNRjID3RwCxHkslt3PHAnum82KtM=
8385
github.com/internetarchive/gocrawlhq v1.2.14 h1:g3MPMonpA6mTkCpjBvW3paeBHiH+gGgwSvkyX/lxu7s=
8486
github.com/internetarchive/gocrawlhq v1.2.14/go.mod h1:IOHVfWsptADzh+r2J+UnSm22EB9r8TiVVeAuP9WRFoc=
87+
github.com/jmespath/go-jmespath v0.4.0 h1:BEgLn5cpjn8UN1mAw4NjwDrS35OdebyEtFe+9YPoQUg=
88+
github.com/jmespath/go-jmespath v0.4.0/go.mod h1:T8mJZnbsbmF+m6zOOFylbeCJqk5+pHWvzYPziyZiYoo=
89+
github.com/jmespath/go-jmespath/internal/testify v1.5.1/go.mod h1:L3OGu8Wl2/fWfCI6z80xFu9LTZmf1ZRjMHUOPmWr69U=
8590
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
8691
github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
8792
github.com/klauspost/compress v1.17.10 h1:oXAz+Vh0PMUvJczoi+flxpnBEPxoER1IaAnU/NMPtT0=
@@ -264,6 +269,7 @@ gopkg.in/ini.v1 v1.67.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k=
264269
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ=
265270
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw=
266271
gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
272+
gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
267273
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
268274
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
269275
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

internal/pkg/crawl/capture.go

Lines changed: 94 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -457,16 +457,25 @@ func (c *Crawl) Capture(item *queue.Item) error {
457457
}
458458

459459
// If the response is an XML document, we want to scrape it for links
460+
var outlinks []*url.URL
460461
if strings.Contains(resp.Header.Get("Content-Type"), "xml") {
461-
URLsFromXML, isSitemap, err := extractor.XML(resp)
462-
if err != nil {
463-
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract URLs from XML")
462+
if extractor.IsS3(resp) {
463+
URLsFromS3, err := extractor.S3(resp)
464+
if err != nil {
465+
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while extracting URLs from S3")
466+
}
467+
468+
outlinks = append(outlinks, URLsFromS3...)
464469
} else {
465-
if isSitemap {
466-
waitGroup.Add(1)
467-
go c.queueOutlinks(URLsFromXML, item, &waitGroup)
470+
URLsFromXML, isSitemap, err := extractor.XML(resp)
471+
if err != nil {
472+
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract URLs from XML")
468473
} else {
469-
assets = append(assets, URLsFromXML...)
474+
if isSitemap {
475+
outlinks = append(outlinks, URLsFromXML...)
476+
} else {
477+
assets = append(assets, URLsFromXML...)
478+
}
470479
}
471480
}
472481
} else if strings.Contains(resp.Header.Get("Content-Type"), "json") {
@@ -488,111 +497,106 @@ func (c *Crawl) Capture(item *queue.Item) error {
488497
}
489498

490499
return err
491-
}
492-
493-
// Turn the response into a doc that we will scrape for outlinks and assets.
494-
doc, err := goquery.NewDocumentFromReader(resp.Body)
495-
if err != nil {
496-
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while creating goquery document")
497-
return err
498-
}
499-
500-
// Execute site-specific code on the document
501-
if cloudflarestream.IsURL(base.Host) {
502-
// Look for JS files necessary for the playback of the video
503-
cfstreamURLs, err := cloudflarestream.GetJSFiles(doc, base, *c.Client)
500+
} else {
501+
// Turn the response into a doc that we will scrape for outlinks and assets.
502+
doc, err := goquery.NewDocumentFromReader(resp.Body)
504503
if err != nil {
505-
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while getting JS files from cloudflarestream")
504+
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while creating goquery document")
506505
return err
507506
}
508507

509-
// Seencheck the URLs we captured, we ignore the returned value here
510-
// because we already archived the URLs, we just want them to be added
511-
// to the seencheck table.
512-
if c.UseSeencheck {
513-
if c.UseHQ {
514-
_, err := c.HQSeencheckURLs(utils.StringSliceToURLSlice(cfstreamURLs))
515-
if err != nil {
516-
c.Log.WithFields(c.genLogFields(err, item.URL, map[string]interface{}{
517-
"urls": cfstreamURLs,
518-
})).Error("error while seenchecking assets via HQ")
519-
}
520-
} else {
521-
for _, cfstreamURL := range cfstreamURLs {
522-
c.Seencheck.SeencheckURL(cfstreamURL, "asset")
523-
}
524-
}
525-
}
526-
// Log the archived URLs
527-
for _, cfstreamURL := range cfstreamURLs {
528-
c.Log.WithFields(c.genLogFields(err, cfstreamURL, map[string]interface{}{
529-
"parentHop": item.Hop,
530-
"parentUrl": utils.URLToString(item.URL),
531-
"type": "asset",
532-
})).Info("URL archived")
533-
}
534-
} else if ina.IsURL(req) {
535-
playerURLs := ina.ExtractPlayerURLs(doc, c.Client)
536-
537-
for _, playerURL := range playerURLs {
538-
playerItem, err := queue.NewItem(playerURL, item.URL, "seed", 0, "", false)
508+
// Execute site-specific code on the document
509+
if cloudflarestream.IsURL(utils.URLToString(item.URL)) {
510+
// Look for JS files necessary for the playback of the video
511+
cfstreamURLs, err := cloudflarestream.GetJSFiles(doc, base, *c.Client)
539512
if err != nil {
540-
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to create new item from player URL")
541-
} else {
542-
c.Capture(playerItem)
513+
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while getting JS files from cloudflarestream")
514+
return err
543515
}
544-
}
545-
}
546-
547-
// Websites can use a <base> tag to specify a base for relative URLs in every other tags.
548-
// This checks for the "base" tag and resets the "base" URL variable with the new base URL specified
549-
// https://developer.mozilla.org/en-US/docs/Web/HTML/Element/base
550-
if !utils.StringInSlice("base", c.DisabledHTMLTags) {
551-
oldBase := base
552516

553-
doc.Find("base").Each(func(index int, goitem *goquery.Selection) {
554-
// If a new base got scraped, stop looking for one
555-
if oldBase != base {
556-
return
517+
// Seencheck the URLs we captured, we ignore the returned value here
518+
// because we already archived the URLs, we just want them to be added
519+
// to the seencheck table.
520+
if c.UseSeencheck {
521+
if c.UseHQ {
522+
_, err := c.HQSeencheckURLs(utils.StringSliceToURLSlice(cfstreamURLs))
523+
if err != nil {
524+
c.Log.WithFields(c.genLogFields(err, item.URL, map[string]interface{}{
525+
"urls": cfstreamURLs,
526+
})).Error("error while seenchecking assets via HQ")
527+
}
528+
} else {
529+
for _, cfstreamURL := range cfstreamURLs {
530+
c.Seencheck.SeencheckURL(cfstreamURL, "asset")
531+
}
532+
}
557533
}
534+
// Log the archived URLs
535+
for _, cfstreamURL := range cfstreamURLs {
536+
c.Log.WithFields(c.genLogFields(err, cfstreamURL, map[string]interface{}{
537+
"parentHop": item.Hop,
538+
"parentUrl": utils.URLToString(item.URL),
539+
"type": "asset",
540+
})).Info("URL archived")
541+
}
542+
} else if ina.IsURL(req) {
543+
playerURLs := ina.ExtractPlayerURLs(doc, c.Client)
558544

559-
// Attempt to get a new base value from the base HTML tag
560-
link, exists := goitem.Attr("href")
561-
if exists {
562-
baseTagValue, err := url.Parse(link)
545+
for _, playerURL := range playerURLs {
546+
playerItem, err := queue.NewItem(playerURL, item.URL, "seed", 0, "", false)
563547
if err != nil {
564-
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while parsing base tag value")
548+
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to create new item from player URL")
565549
} else {
566-
base = baseTagValue
550+
c.Capture(playerItem)
567551
}
568552
}
569-
})
570-
}
553+
}
571554

572-
// Extract outlinks
573-
outlinks, err := c.extractOutlinks(base, doc)
574-
if err != nil {
575-
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while extracting outlinks")
576-
return err
577-
}
555+
// Websites can use a <base> tag to specify a base for relative URLs in every other tags.
556+
// This checks for the "base" tag and resets the "base" URL variable with the new base URL specified
557+
// https://developer.mozilla.org/en-US/docs/Web/HTML/Element/base
558+
if !utils.StringInSlice("base", c.DisabledHTMLTags) {
559+
oldBase := base
578560

579-
waitGroup.Add(1)
580-
go c.queueOutlinks(outlinks, item, &waitGroup)
561+
doc.Find("base").Each(func(index int, goitem *goquery.Selection) {
562+
// If a new base got scraped, stop looking for one
563+
if oldBase != base {
564+
return
565+
}
581566

582-
if c.DisableAssetsCapture {
583-
return err
584-
}
567+
// Attempt to get a new base value from the base HTML tag
568+
link, exists := goitem.Attr("href")
569+
if exists {
570+
baseTagValue, err := url.Parse(link)
571+
if err != nil {
572+
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while parsing base tag value")
573+
} else {
574+
base = baseTagValue
575+
}
576+
}
577+
})
578+
}
585579

586-
// Extract and capture assets (only if we didn't use an extractor that produce assets)
587-
if len(assets) == 0 {
588-
assets, err = c.extractAssets(base, item, doc)
580+
// Extract outlinks
581+
outlinks, err = c.extractOutlinks(base, doc)
589582
if err != nil {
590-
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while extracting assets")
583+
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while extracting outlinks")
591584
return err
592585
}
586+
587+
if !c.DisableAssetsCapture {
588+
assets, err = c.extractAssets(base, item, doc)
589+
if err != nil {
590+
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while extracting assets")
591+
return err
592+
}
593+
}
593594
}
594595

595-
if len(assets) != 0 {
596+
waitGroup.Add(1)
597+
go c.queueOutlinks(outlinks, item, &waitGroup)
598+
599+
if !c.DisableAssetsCapture && len(assets) != 0 {
596600
assets = c.seencheckAssets(assets, item)
597601
if len(assets) != 0 {
598602
c.captureAssets(item, assets, resp.Cookies(), nil)

0 commit comments

Comments
 (0)