@@ -457,16 +457,25 @@ func (c *Crawl) Capture(item *queue.Item) error {
457457 }
458458
459459 // If the response is an XML document, we want to scrape it for links
460+ var outlinks []* url.URL
460461 if strings .Contains (resp .Header .Get ("Content-Type" ), "xml" ) {
461- URLsFromXML , isSitemap , err := extractor .XML (resp )
462- if err != nil {
463- c .Log .WithFields (c .genLogFields (err , item .URL , nil )).Error ("unable to extract URLs from XML" )
462+ if extractor .IsS3 (resp ) {
463+ URLsFromS3 , err := extractor .S3 (resp )
464+ if err != nil {
465+ c .Log .WithFields (c .genLogFields (err , item .URL , nil )).Error ("error while extracting URLs from S3" )
466+ }
467+
468+ outlinks = append (outlinks , URLsFromS3 ... )
464469 } else {
465- if isSitemap {
466- waitGroup . Add ( 1 )
467- go c . queueOutlinks ( URLsFromXML , item , & waitGroup )
470+ URLsFromXML , isSitemap , err := extractor . XML ( resp )
471+ if err != nil {
472+ c . Log . WithFields ( c . genLogFields ( err , item . URL , nil )). Error ( "unable to extract URLs from XML" )
468473 } else {
469- assets = append (assets , URLsFromXML ... )
474+ if isSitemap {
475+ outlinks = append (outlinks , URLsFromXML ... )
476+ } else {
477+ assets = append (assets , URLsFromXML ... )
478+ }
470479 }
471480 }
472481 } else if strings .Contains (resp .Header .Get ("Content-Type" ), "json" ) {
@@ -488,111 +497,106 @@ func (c *Crawl) Capture(item *queue.Item) error {
488497 }
489498
490499 return err
491- }
492-
493- // Turn the response into a doc that we will scrape for outlinks and assets.
494- doc , err := goquery .NewDocumentFromReader (resp .Body )
495- if err != nil {
496- c .Log .WithFields (c .genLogFields (err , item .URL , nil )).Error ("error while creating goquery document" )
497- return err
498- }
499-
500- // Execute site-specific code on the document
501- if cloudflarestream .IsURL (base .Host ) {
502- // Look for JS files necessary for the playback of the video
503- cfstreamURLs , err := cloudflarestream .GetJSFiles (doc , base , * c .Client )
500+ } else {
501+ // Turn the response into a doc that we will scrape for outlinks and assets.
502+ doc , err := goquery .NewDocumentFromReader (resp .Body )
504503 if err != nil {
505- c .Log .WithFields (c .genLogFields (err , item .URL , nil )).Error ("error while getting JS files from cloudflarestream " )
504+ c .Log .WithFields (c .genLogFields (err , item .URL , nil )).Error ("error while creating goquery document " )
506505 return err
507506 }
508507
509- // Seencheck the URLs we captured, we ignore the returned value here
510- // because we already archived the URLs, we just want them to be added
511- // to the seencheck table.
512- if c .UseSeencheck {
513- if c .UseHQ {
514- _ , err := c .HQSeencheckURLs (utils .StringSliceToURLSlice (cfstreamURLs ))
515- if err != nil {
516- c .Log .WithFields (c .genLogFields (err , item .URL , map [string ]interface {}{
517- "urls" : cfstreamURLs ,
518- })).Error ("error while seenchecking assets via HQ" )
519- }
520- } else {
521- for _ , cfstreamURL := range cfstreamURLs {
522- c .Seencheck .SeencheckURL (cfstreamURL , "asset" )
523- }
524- }
525- }
526- // Log the archived URLs
527- for _ , cfstreamURL := range cfstreamURLs {
528- c .Log .WithFields (c .genLogFields (err , cfstreamURL , map [string ]interface {}{
529- "parentHop" : item .Hop ,
530- "parentUrl" : utils .URLToString (item .URL ),
531- "type" : "asset" ,
532- })).Info ("URL archived" )
533- }
534- } else if ina .IsURL (req ) {
535- playerURLs := ina .ExtractPlayerURLs (doc , c .Client )
536-
537- for _ , playerURL := range playerURLs {
538- playerItem , err := queue .NewItem (playerURL , item .URL , "seed" , 0 , "" , false )
508+ // Execute site-specific code on the document
509+ if cloudflarestream .IsURL (utils .URLToString (item .URL )) {
510+ // Look for JS files necessary for the playback of the video
511+ cfstreamURLs , err := cloudflarestream .GetJSFiles (doc , base , * c .Client )
539512 if err != nil {
540- c .Log .WithFields (c .genLogFields (err , item .URL , nil )).Error ("unable to create new item from player URL" )
541- } else {
542- c .Capture (playerItem )
513+ c .Log .WithFields (c .genLogFields (err , item .URL , nil )).Error ("error while getting JS files from cloudflarestream" )
514+ return err
543515 }
544- }
545- }
546-
547- // Websites can use a <base> tag to specify a base for relative URLs in every other tags.
548- // This checks for the "base" tag and resets the "base" URL variable with the new base URL specified
549- // https://developer.mozilla.org/en-US/docs/Web/HTML/Element/base
550- if ! utils .StringInSlice ("base" , c .DisabledHTMLTags ) {
551- oldBase := base
552516
553- doc .Find ("base" ).Each (func (index int , goitem * goquery.Selection ) {
554- // If a new base got scraped, stop looking for one
555- if oldBase != base {
556- return
517+ // Seencheck the URLs we captured, we ignore the returned value here
518+ // because we already archived the URLs, we just want them to be added
519+ // to the seencheck table.
520+ if c .UseSeencheck {
521+ if c .UseHQ {
522+ _ , err := c .HQSeencheckURLs (utils .StringSliceToURLSlice (cfstreamURLs ))
523+ if err != nil {
524+ c .Log .WithFields (c .genLogFields (err , item .URL , map [string ]interface {}{
525+ "urls" : cfstreamURLs ,
526+ })).Error ("error while seenchecking assets via HQ" )
527+ }
528+ } else {
529+ for _ , cfstreamURL := range cfstreamURLs {
530+ c .Seencheck .SeencheckURL (cfstreamURL , "asset" )
531+ }
532+ }
557533 }
534+ // Log the archived URLs
535+ for _ , cfstreamURL := range cfstreamURLs {
536+ c .Log .WithFields (c .genLogFields (err , cfstreamURL , map [string ]interface {}{
537+ "parentHop" : item .Hop ,
538+ "parentUrl" : utils .URLToString (item .URL ),
539+ "type" : "asset" ,
540+ })).Info ("URL archived" )
541+ }
542+ } else if ina .IsURL (req ) {
543+ playerURLs := ina .ExtractPlayerURLs (doc , c .Client )
558544
559- // Attempt to get a new base value from the base HTML tag
560- link , exists := goitem .Attr ("href" )
561- if exists {
562- baseTagValue , err := url .Parse (link )
545+ for _ , playerURL := range playerURLs {
546+ playerItem , err := queue .NewItem (playerURL , item .URL , "seed" , 0 , "" , false )
563547 if err != nil {
564- c .Log .WithFields (c .genLogFields (err , item .URL , nil )).Error ("error while parsing base tag value " )
548+ c .Log .WithFields (c .genLogFields (err , item .URL , nil )).Error ("unable to create new item from player URL " )
565549 } else {
566- base = baseTagValue
550+ c . Capture ( playerItem )
567551 }
568552 }
569- })
570- }
553+ }
571554
572- // Extract outlinks
573- outlinks , err := c .extractOutlinks (base , doc )
574- if err != nil {
575- c .Log .WithFields (c .genLogFields (err , item .URL , nil )).Error ("error while extracting outlinks" )
576- return err
577- }
555+ // Websites can use a <base> tag to specify a base for relative URLs in every other tags.
556+ // This checks for the "base" tag and resets the "base" URL variable with the new base URL specified
557+ // https://developer.mozilla.org/en-US/docs/Web/HTML/Element/base
558+ if ! utils .StringInSlice ("base" , c .DisabledHTMLTags ) {
559+ oldBase := base
578560
579- waitGroup .Add (1 )
580- go c .queueOutlinks (outlinks , item , & waitGroup )
561+ doc .Find ("base" ).Each (func (index int , goitem * goquery.Selection ) {
562+ // If a new base got scraped, stop looking for one
563+ if oldBase != base {
564+ return
565+ }
581566
582- if c .DisableAssetsCapture {
583- return err
584- }
567+ // Attempt to get a new base value from the base HTML tag
568+ link , exists := goitem .Attr ("href" )
569+ if exists {
570+ baseTagValue , err := url .Parse (link )
571+ if err != nil {
572+ c .Log .WithFields (c .genLogFields (err , item .URL , nil )).Error ("error while parsing base tag value" )
573+ } else {
574+ base = baseTagValue
575+ }
576+ }
577+ })
578+ }
585579
586- // Extract and capture assets (only if we didn't use an extractor that produce assets)
587- if len (assets ) == 0 {
588- assets , err = c .extractAssets (base , item , doc )
580+ // Extract outlinks
581+ outlinks , err = c .extractOutlinks (base , doc )
589582 if err != nil {
590- c .Log .WithFields (c .genLogFields (err , item .URL , nil )).Error ("error while extracting assets " )
583+ c .Log .WithFields (c .genLogFields (err , item .URL , nil )).Error ("error while extracting outlinks " )
591584 return err
592585 }
586+
587+ if ! c .DisableAssetsCapture {
588+ assets , err = c .extractAssets (base , item , doc )
589+ if err != nil {
590+ c .Log .WithFields (c .genLogFields (err , item .URL , nil )).Error ("error while extracting assets" )
591+ return err
592+ }
593+ }
593594 }
594595
595- if len (assets ) != 0 {
596+ waitGroup .Add (1 )
597+ go c .queueOutlinks (outlinks , item , & waitGroup )
598+
599+ if ! c .DisableAssetsCapture && len (assets ) != 0 {
596600 assets = c .seencheckAssets (assets , item )
597601 if len (assets ) != 0 {
598602 c .captureAssets (item , assets , resp .Cookies (), nil )
0 commit comments