|
1 | 1 | package crawl |
2 | 2 |
|
3 | 3 | import ( |
| 4 | + "io" |
| 5 | + "net/http" |
4 | 6 | "net/url" |
5 | 7 | "regexp" |
6 | 8 | "strings" |
| 9 | + "sync/atomic" |
7 | 10 |
|
8 | 11 | "github.com/PuerkitoBio/goquery" |
9 | 12 | "github.com/internetarchive/Zeno/internal/pkg/crawl/extractor" |
10 | 13 | "github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/cloudflarestream" |
11 | 14 | "github.com/internetarchive/Zeno/internal/pkg/queue" |
12 | 15 | "github.com/internetarchive/Zeno/internal/pkg/utils" |
| 16 | + "github.com/remeh/sizedwaitgroup" |
13 | 17 | ) |
14 | 18 |
|
15 | 19 | var backgroundImageRegex = regexp.MustCompile(`(?:\(['"]?)(.*?)(?:['"]?\))`) |
16 | 20 | var urlRegex = regexp.MustCompile(`(?m)url\((.*?)\)`) |
17 | 21 |
|
| 22 | +func (c *Crawl) captureAsset(item *queue.Item, cookies []*http.Cookie, headers map[string]string) error { |
| 23 | + var resp *http.Response |
| 24 | + |
| 25 | + // Prepare GET request |
| 26 | + req, err := http.NewRequest("GET", utils.URLToString(item.URL), nil) |
| 27 | + if err != nil { |
| 28 | + return err |
| 29 | + } |
| 30 | + |
| 31 | + req.Header.Set("Referer", utils.URLToString(item.ParentURL)) |
| 32 | + req.Header.Set("User-Agent", c.UserAgent) |
| 33 | + |
| 34 | + // If headers are passed, apply them to the request |
| 35 | + if headers != nil { |
| 36 | + for key, value := range headers { |
| 37 | + req.Header.Set(key, value) |
| 38 | + } |
| 39 | + } |
| 40 | + |
| 41 | + // Apply the cookies obtained when the original URL was captured
| 42 | + for i := range cookies { |
| 43 | + req.AddCookie(cookies[i]) |
| 44 | + } |
| 45 | + |
| 46 | + resp, err = c.executeGET(item, req, false) |
| 47 | + if err != nil && err.Error() == "URL from redirection has already been seen" { |
| 48 | + return nil |
| 49 | + } else if err != nil { |
| 50 | + return err |
| 51 | + } |
| 52 | + defer resp.Body.Close() |
| 53 | + |
| 54 | + if extractor.IsM3U8(resp) { |
| 55 | + assets, err := extractor.M3U8(resp) |
| 56 | + if err == nil { |
| 57 | + c.captureAssets(item, assets, cookies, headers) |
| 58 | + } else { |
| 59 | + c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract URLs from M3U8") |
| 60 | + } |
| 61 | + } |
| 62 | + |
| 63 | + io.Copy(io.Discard, resp.Body) |
| 64 | + |
| 65 | + return nil |
| 66 | +} |
| 67 | + |
| 68 | +func (c *Crawl) captureAssets(item *queue.Item, assets []*url.URL, cookies []*http.Cookie, headers map[string]string) { |
| 69 | + // TODO: implement a counter for the number of assets |
| 70 | + // currently being processed |
| 71 | + // c.Frontier.QueueCount.Incr(int64(len(assets))) |
| 72 | + swg := sizedwaitgroup.New(int(c.MaxConcurrentAssets)) |
| 73 | + excluded := false |
| 74 | + |
| 75 | + for _, asset := range assets { |
| 76 | + // TODO: implement a counter for the number of assets |
| 77 | + // currently being processed |
| 78 | + // c.Frontier.QueueCount.Incr(-1) |
| 79 | + |
| 80 | + // Skip the asset if it is the item's own URL, so that we do not archive the original URL twice
| 81 | + if utils.URLToString(item.URL) == utils.URLToString(asset) { |
| 82 | + continue |
| 83 | + } |
| 84 | + |
| 85 | + // If the URL matches any excluded string, we ignore it
| 86 | + for _, excludedString := range c.ExcludedStrings { |
| 87 | + if strings.Contains(utils.URLToString(asset), excludedString) { |
| 88 | + excluded = true |
| 89 | + break |
| 90 | + } |
| 91 | + } |
| 92 | + |
| 93 | + if excluded { |
| 94 | + excluded = false |
| 95 | + continue |
| 96 | + } |
| 97 | + |
| 98 | + swg.Add() |
| 99 | + c.URIsPerSecond.Incr(1) |
| 100 | + |
| 101 | + go func(asset *url.URL, swg *sizedwaitgroup.SizedWaitGroup) { |
| 102 | + defer swg.Done() |
| 103 | + |
| 104 | + // Create the asset's item |
| 105 | + newAsset, err := queue.NewItem(asset, item.URL, "asset", item.Hop, "", false) |
| 106 | + if err != nil { |
| 107 | + c.Log.WithFields(c.genLogFields(err, asset, map[string]interface{}{ |
| 108 | + "parentHop": item.Hop, |
| 109 | + "parentUrl": utils.URLToString(item.URL), |
| 110 | + "type": "asset", |
| 111 | + })).Error("error while creating asset item") |
| 112 | + return |
| 113 | + } |
| 114 | + |
| 115 | + // Capture the asset |
| 116 | + err = c.captureAsset(newAsset, cookies, headers) |
| 117 | + if err != nil { |
| 118 | + c.Log.WithFields(c.genLogFields(err, &asset, map[string]interface{}{ |
| 119 | + "parentHop": item.Hop, |
| 120 | + "parentUrl": utils.URLToString(item.URL), |
| 121 | + "type": "asset", |
| 122 | + })).Error("error while capturing asset") |
| 123 | + return |
| 124 | + } |
| 125 | + |
| 126 | + // If we made it to this point, it means that the asset has been crawled successfully,
| 127 | + // so we can increment the item's LocallyCrawled counter
| 128 | + atomic.AddUint64(&item.LocallyCrawled, 1) |
| 129 | + }(asset, &swg) |
| 130 | + } |
| 131 | + |
| 132 | + swg.Wait() |
| 133 | +} |
| 134 | + |
18 | 135 | func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Document) (assets []*url.URL, err error) { |
19 | 136 | var rawAssets []string |
20 | 137 | var URL = utils.URLToString(item.URL) |
@@ -198,7 +315,7 @@ func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Docu |
198 | 315 | if err != nil { |
199 | 316 | c.Log.Error("unable to extract URLs from JSON in script tag", "error", err, "url", URL) |
200 | 317 | } else { |
201 | | - rawAssets = append(rawAssets, removeGoogleVideoURLs(URLsFromJSON)...) |
| 318 | + rawAssets = append(rawAssets, URLsFromJSON...) |
202 | 319 | } |
203 | 320 | } |
204 | 321 | } |
@@ -274,21 +391,26 @@ func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Docu |
274 | 391 | // Turn strings into url.URL |
275 | 392 | assets = append(assets, utils.StringSliceToURLSlice(rawAssets)...) |
276 | 393 |
|
277 | | - // Ensure that excluded hosts aren't in the assets. |
278 | | - assets = c.excludeHosts(assets) |
279 | | - |
280 | | - // Go over all assets and outlinks and make sure they are absolute links |
281 | | - assets = utils.MakeAbsolute(base, assets) |
| 394 | + // Ensure that no asset that would be excluded is added to the list, |
| 395 | + // remove all fragments, and make sure that all assets are absolute URLs |
| 396 | + assets = c.cleanURLs(base, assets) |
282 | 397 |
|
283 | 398 | return utils.DedupeURLs(assets), nil |
284 | 399 | } |
285 | 400 |
|
286 | | -func removeGoogleVideoURLs(input []string) (output []string) { |
287 | | - for _, i := range input { |
288 | | - if !strings.Contains(i, "googlevideo.com") { |
289 | | - output = append(output, i) |
| 401 | +func (c *Crawl) cleanURLs(base *url.URL, URLs []*url.URL) (output []*url.URL) { |
| 402 | + // Remove excluded URLs |
| 403 | + for _, URL := range URLs { |
| 404 | + if !c.isExcluded(URL) { |
| 405 | + output = append(output, URL) |
290 | 406 | } |
291 | 407 | } |
292 | 408 |
|
293 | | - return output |
| 409 | + // Make all URLs absolute |
| 410 | + if base != nil { |
| 411 | + output = utils.MakeAbsolute(base, output) |
| 412 | + } |
| 413 | + |
| 414 | + // Remove fragments |
| 415 | + return utils.RemoveFragments(output) |
294 | 416 | } |
0 commit comments