Skip to content

Commit cfa2980

Browse files
authored
Add proper YouTube archiving via YT-DLP (#126)
* add: yt-dlp support to gather YouTube URLs from watch pages
* [site/yt] add: format selection & metadata record
* [ext/m3u8] initial commit
* fix: remove default global HTTP timeout
* [site/yt] wip: fix tests
* chores: small refactoring
* [site/yt] fix test
* ytdlp: remove useless subtitles parsing function
* m3u8: handle content-type case insensitively
* chore: small refactoring
* ytdlp: add dubbed audio streams
* ytdlp: format selection & refactoring
1 parent 6d512bb commit cfa2980

File tree

21 files changed

+714
-174
lines changed

21 files changed

+714
-174
lines changed

cmd/get.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ func getCMDsFlags(getCmd *cobra.Command) {
4343
getCmd.PersistentFlags().String("prometheus-prefix", "zeno:", "String used as a prefix for the exported Prometheus metrics.")
4444
getCmd.PersistentFlags().Int("max-redirect", 20, "Specifies the maximum number of redirections to follow for a resource.")
4545
getCmd.PersistentFlags().Int("max-retry", 5, "Number of retry if error happen when executing HTTP request.")
46-
getCmd.PersistentFlags().Int("http-timeout", 30, "Number of seconds to wait before timing out a request.")
46+
getCmd.PersistentFlags().Int("http-timeout", -1, "Number of seconds to wait before timing out a request.")
4747
getCmd.PersistentFlags().Bool("domains-crawl", false, "If this is turned on, seeds will be treated as domains to crawl, therefore same-domain outlinks will be added to the queue as hop=0.")
4848
getCmd.PersistentFlags().StringSlice("disable-html-tag", []string{}, "Specify HTML tag to not extract assets from")
4949
getCmd.PersistentFlags().Bool("capture-alternate-pages", false, "If turned on, <link> HTML tags with \"alternate\" values for their \"rel\" attribute will be archived.")
@@ -84,6 +84,10 @@ func getCMDsFlags(getCmd *cobra.Command) {
8484
getCmd.PersistentFlags().String("es-password", "", "ElasticSearch password to use for indexing crawl logs.")
8585
getCmd.PersistentFlags().String("es-index-prefix", "zeno", "ElasticSearch index prefix to use for indexing crawl logs. Default is : `zeno`, without `-`")
8686

87+
// Dependencies flags
88+
getCmd.PersistentFlags().Bool("no-ytdlp", false, "Disable youtube-dlp usage for video extraction.")
89+
getCmd.PersistentFlags().String("ytdlp-path", "", "Path to youtube-dlp binary.")
90+
8791
// Alias support
8892
// As cobra doesn't support aliases natively (couldn't find a way to do it), we have to do it manually
8993
// This is a workaround to allow users to use `--hops` instead of `--max-hops` for example

config/config.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,10 @@ type Config struct {
7676
NoStdoutLogging bool `mapstructure:"no-stdout-log"`
7777
NoBatchWriteWAL bool `mapstructure:"ultrasafe-queue"`
7878
Handover bool `mapstructure:"handover"`
79+
80+
// Dependencies
81+
NoYTDLP bool `mapstructure:"no-ytdlp"`
82+
YTDLPPath string `mapstructure:"ytdlp-path"`
7983
}
8084

8185
var (

go.mod

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ require (
1313
github.com/google/uuid v1.6.0
1414
github.com/gosuri/uilive v0.0.4
1515
github.com/gosuri/uitable v0.0.4
16+
github.com/grafov/m3u8 v0.12.0
1617
github.com/paulbellamy/ratecounter v0.2.0
1718
github.com/philippgille/gokv/leveldb v0.7.0
1819
github.com/prometheus/client_golang v1.20.3

go.sum

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,12 @@ github.com/gosuri/uilive v0.0.4 h1:hUEBpQDj8D8jXgtCdBu7sWsy5sbW/5GhuO8KBwJ2jyY=
5757
github.com/gosuri/uilive v0.0.4/go.mod h1:V/epo5LjjlDE5RJUcqx8dbw+zc93y5Ya3yg8tfZ74VI=
5858
github.com/gosuri/uitable v0.0.4 h1:IG2xLKRvErL3uhY6e1BylFzG+aJiwQviDDTfOKeKTpY=
5959
github.com/gosuri/uitable v0.0.4/go.mod h1:tKR86bXuXPZazfOTG1FIzvjIdXzd0mo4Vtn16vt0PJo=
60+
github.com/grafana/pyroscope-go v1.1.2 h1:7vCfdORYQMCxIzI3NlYAs3FcBP760+gWuYWOyiVyYx8=
61+
github.com/grafana/pyroscope-go v1.1.2/go.mod h1:HSSmHo2KRn6FasBA4vK7BMiQqyQq8KSuBKvrhkXxYPU=
62+
github.com/grafana/pyroscope-go/godeltaprof v0.1.8 h1:iwOtYXeeVSAeYefJNaxDytgjKtUuKQbJqgAIjlnicKg=
63+
github.com/grafana/pyroscope-go/godeltaprof v0.1.8/go.mod h1:2+l7K7twW49Ct4wFluZD3tZ6e0SjanjcUUBPVD/UuGU=
64+
github.com/grafov/m3u8 v0.12.0 h1:T6iTwTsSEtMcwkayef+FJO8kj+Sglr4Lh81Zj8Ked/4=
65+
github.com/grafov/m3u8 v0.12.0/go.mod h1:nqzOkfBiZJENr52zTVd/Dcl03yzphIMbJqkXGu+u080=
6066
github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4=
6167
github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ=
6268
github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU=

internal/pkg/crawl/assets.go

Lines changed: 133 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,137 @@
11
package crawl
22

33
import (
4+
"io"
5+
"net/http"
46
"net/url"
57
"regexp"
68
"strings"
9+
"sync/atomic"
710

811
"github.com/PuerkitoBio/goquery"
912
"github.com/internetarchive/Zeno/internal/pkg/crawl/extractor"
1013
"github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/cloudflarestream"
1114
"github.com/internetarchive/Zeno/internal/pkg/queue"
1215
"github.com/internetarchive/Zeno/internal/pkg/utils"
16+
"github.com/remeh/sizedwaitgroup"
1317
)
1418

1519
var backgroundImageRegex = regexp.MustCompile(`(?:\(['"]?)(.*?)(?:['"]?\))`)
1620
var urlRegex = regexp.MustCompile(`(?m)url\((.*?)\)`)
1721

22+
func (c *Crawl) captureAsset(item *queue.Item, cookies []*http.Cookie, headers map[string]string) error {
23+
var resp *http.Response
24+
25+
// Prepare GET request
26+
req, err := http.NewRequest("GET", utils.URLToString(item.URL), nil)
27+
if err != nil {
28+
return err
29+
}
30+
31+
req.Header.Set("Referer", utils.URLToString(item.ParentURL))
32+
req.Header.Set("User-Agent", c.UserAgent)
33+
34+
// If headers are passed, apply them to the request
35+
if headers != nil {
36+
for key, value := range headers {
37+
req.Header.Set(key, value)
38+
}
39+
}
40+
41+
// Apply cookies obtained from the original URL captured
42+
for i := range cookies {
43+
req.AddCookie(cookies[i])
44+
}
45+
46+
resp, err = c.executeGET(item, req, false)
47+
if err != nil && err.Error() == "URL from redirection has already been seen" {
48+
return nil
49+
} else if err != nil {
50+
return err
51+
}
52+
defer resp.Body.Close()
53+
54+
if extractor.IsM3U8(resp) {
55+
assets, err := extractor.M3U8(resp)
56+
if err == nil {
57+
c.captureAssets(item, assets, cookies, headers)
58+
} else {
59+
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract URLs from M3U8")
60+
}
61+
}
62+
63+
io.Copy(io.Discard, resp.Body)
64+
65+
return nil
66+
}
67+
68+
func (c *Crawl) captureAssets(item *queue.Item, assets []*url.URL, cookies []*http.Cookie, headers map[string]string) {
69+
// TODO: implement a counter for the number of assets
70+
// currently being processed
71+
// c.Frontier.QueueCount.Incr(int64(len(assets)))
72+
swg := sizedwaitgroup.New(int(c.MaxConcurrentAssets))
73+
excluded := false
74+
75+
for _, asset := range assets {
76+
// TODO: implement a counter for the number of assets
77+
// currently being processed
78+
// c.Frontier.QueueCount.Incr(-1)
79+
80+
// Just making sure we do not over archive by archiving the original URL
81+
if utils.URLToString(item.URL) == utils.URLToString(asset) {
82+
continue
83+
}
84+
85+
// If the URL match any excluded string, we ignore it
86+
for _, excludedString := range c.ExcludedStrings {
87+
if strings.Contains(utils.URLToString(asset), excludedString) {
88+
excluded = true
89+
break
90+
}
91+
}
92+
93+
if excluded {
94+
excluded = false
95+
continue
96+
}
97+
98+
swg.Add()
99+
c.URIsPerSecond.Incr(1)
100+
101+
go func(asset *url.URL, swg *sizedwaitgroup.SizedWaitGroup) {
102+
defer swg.Done()
103+
104+
// Create the asset's item
105+
newAsset, err := queue.NewItem(asset, item.URL, "asset", item.Hop, "", false)
106+
if err != nil {
107+
c.Log.WithFields(c.genLogFields(err, asset, map[string]interface{}{
108+
"parentHop": item.Hop,
109+
"parentUrl": utils.URLToString(item.URL),
110+
"type": "asset",
111+
})).Error("error while creating asset item")
112+
return
113+
}
114+
115+
// Capture the asset
116+
err = c.captureAsset(newAsset, cookies, headers)
117+
if err != nil {
118+
c.Log.WithFields(c.genLogFields(err, &asset, map[string]interface{}{
119+
"parentHop": item.Hop,
120+
"parentUrl": utils.URLToString(item.URL),
121+
"type": "asset",
122+
})).Error("error while capturing asset")
123+
return
124+
}
125+
126+
// If we made it to this point, it means that the asset have been crawled successfully,
127+
// then we can increment the locallyCrawled variable
128+
atomic.AddUint64(&item.LocallyCrawled, 1)
129+
}(asset, &swg)
130+
}
131+
132+
swg.Wait()
133+
}
134+
18135
func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Document) (assets []*url.URL, err error) {
19136
var rawAssets []string
20137
var URL = utils.URLToString(item.URL)
@@ -198,7 +315,7 @@ func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Docu
198315
if err != nil {
199316
c.Log.Error("unable to extract URLs from JSON in script tag", "error", err, "url", URL)
200317
} else {
201-
rawAssets = append(rawAssets, removeGoogleVideoURLs(URLsFromJSON)...)
318+
rawAssets = append(rawAssets, URLsFromJSON...)
202319
}
203320
}
204321
}
@@ -274,21 +391,26 @@ func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Docu
274391
// Turn strings into url.URL
275392
assets = append(assets, utils.StringSliceToURLSlice(rawAssets)...)
276393

277-
// Ensure that excluded hosts aren't in the assets.
278-
assets = c.excludeHosts(assets)
279-
280-
// Go over all assets and outlinks and make sure they are absolute links
281-
assets = utils.MakeAbsolute(base, assets)
394+
// Ensure that no asset that would be excluded is added to the list,
395+
// remove all fragments, and make sure that all assets are absolute URLs
396+
assets = c.cleanURLs(base, assets)
282397

283398
return utils.DedupeURLs(assets), nil
284399
}
285400

286-
func removeGoogleVideoURLs(input []string) (output []string) {
287-
for _, i := range input {
288-
if !strings.Contains(i, "googlevideo.com") {
289-
output = append(output, i)
401+
func (c *Crawl) cleanURLs(base *url.URL, URLs []*url.URL) (output []*url.URL) {
402+
// Remove excluded URLs
403+
for _, URL := range URLs {
404+
if !c.isExcluded(URL) {
405+
output = append(output, URL)
290406
}
291407
}
292408

293-
return output
409+
// Make all URLs absolute
410+
if base != nil {
411+
output = utils.MakeAbsolute(base, output)
412+
}
413+
414+
// Remove fragments
415+
return utils.RemoveFragments(output)
294416
}

0 commit comments

Comments
 (0)