Skip to content

Commit bbfd8dc

Browse files
committed
add: ina.fr video archiving support
1 parent bd64372 commit bbfd8dc

File tree

7 files changed

+162
-20
lines changed

7 files changed

+162
-20
lines changed

internal/pkg/crawl/assets.go

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,7 @@ func (c *Crawl) seencheckAssets(assets []*url.URL, item *queue.Item) []*url.URL
164164
if found {
165165
continue
166166
}
167+
167168
seencheckedBatch = append(seencheckedBatch, URL)
168169
}
169170

@@ -183,15 +184,13 @@ func (c *Crawl) extractAssets(base *url.URL, item *queue.Item, doc *goquery.Docu
183184
var URL = utils.URLToString(item.URL)
184185

185186
// Execute plugins on the response
186-
if strings.Contains(base.Host, "cloudflarestream.com") {
187+
if cloudflarestream.IsURL(URL) {
187188
cloudflarestreamURLs, err := cloudflarestream.GetSegments(base, *c.Client)
188189
if err != nil {
189190
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Warn("error getting cloudflarestream segments")
190191
}
191192

192-
if len(cloudflarestreamURLs) > 0 {
193-
assets = append(assets, cloudflarestreamURLs...)
194-
}
193+
assets = append(assets, cloudflarestreamURLs...)
195194
}
196195

197196
// Get assets from JSON payloads in data-item values

internal/pkg/crawl/capture.go

Lines changed: 37 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import (
1515
"github.com/internetarchive/Zeno/internal/pkg/crawl/extractor"
1616
"github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/cloudflarestream"
1717
"github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/facebook"
18+
"github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/ina"
1819
"github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/libsyn"
1920
"github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/reddit"
2021
"github.com/internetarchive/Zeno/internal/pkg/crawl/sitespecific/telegram"
@@ -309,7 +310,7 @@ func (c *Crawl) Capture(item *queue.Item) error {
309310
}
310311
} else if vk.IsVKURL(utils.URLToString(item.URL)) {
311312
vk.AddHeaders(req)
312-
} else if reddit.IsRedditURL(utils.URLToString(item.URL)) {
313+
} else if reddit.IsURL(utils.URLToString(item.URL)) {
313314
reddit.AddCookies(req)
314315
}
315316

@@ -392,15 +393,12 @@ func (c *Crawl) Capture(item *queue.Item) error {
392393
}
393394

394395
return nil
395-
} else if reddit.IsRedditPostAPI(req) {
396-
body, err := io.ReadAll(resp.Body)
396+
} else if reddit.IsPostAPI(req) {
397+
permalinks, rawAssets, err := reddit.ExtractPost(resp)
397398
if err != nil {
398-
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while reading response body")
399-
return err
399+
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract post from Reddit")
400400
}
401401

402-
permalinks, rawAssets, err := reddit.ExtractPost(body)
403-
404402
// Queue the permalinks
405403
waitGroup.Add(1)
406404
go c.queueOutlinks(utils.StringSliceToURLSlice(permalinks), item, &waitGroup)
@@ -416,6 +414,26 @@ func (c *Crawl) Capture(item *queue.Item) error {
416414
}
417415

418416
return nil
417+
} else if ina.IsAPIURL(req) {
418+
rawAssets, err := ina.ExtractMedias(resp)
419+
if err != nil {
420+
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to extract medias from INA")
421+
}
422+
423+
if len(rawAssets) != 0 {
424+
assets = c.seencheckAssets(rawAssets, item)
425+
426+
if len(assets) != 0 {
427+
for _, asset := range rawAssets {
428+
playerItem, err := queue.NewItem(asset, item.URL, "seed", 0, "", false)
429+
if err != nil {
430+
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to create new item from asset")
431+
} else {
432+
c.Capture(playerItem)
433+
}
434+
}
435+
}
436+
}
419437
}
420438

421439
// Scrape potential URLs from Link HTTP header
@@ -480,7 +498,7 @@ func (c *Crawl) Capture(item *queue.Item) error {
480498
}
481499

482500
// Execute site-specific code on the document
483-
if strings.Contains(base.Host, "cloudflarestream.com") {
501+
if cloudflarestream.IsURL(base.Host) {
484502
// Look for JS files necessary for the playback of the video
485503
cfstreamURLs, err := cloudflarestream.GetJSFiles(doc, base, *c.Client)
486504
if err != nil {
@@ -513,6 +531,17 @@ func (c *Crawl) Capture(item *queue.Item) error {
513531
"type": "asset",
514532
})).Info("URL archived")
515533
}
534+
} else if ina.IsURL(req) {
535+
playerURLs := ina.ExtractPlayerURLs(doc)
536+
537+
for _, playerURL := range playerURLs {
538+
playerItem, err := queue.NewItem(playerURL, item.URL, "seed", 0, "", false)
539+
if err != nil {
540+
c.Log.WithFields(c.genLogFields(err, item.URL, nil)).Error("unable to create new item from player URL")
541+
} else {
542+
c.Capture(playerItem)
543+
}
544+
}
516545
}
517546

518547
// Websites can use a <base> tag to specify a base for relative URLs in every other tags.

internal/pkg/crawl/sitespecific/cloudflarestream/cloudflarestream.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,10 @@ type MPD struct {
5959
} `xml:"Period"`
6060
}
6161

62+
// IsURL reports whether URL designates cloudflarestream.com or one of its
// subdomains.
//
// URL may be a full URL ("https://watch.cloudflarestream.com/abc") or a bare
// host ("watch.cloudflarestream.com", optionally followed by a port or path).
// Only the host part is examined, so an unrelated URL that merely mentions
// cloudflarestream.com in its path or query string does not match.
func IsURL(URL string) bool {
	host := URL
	if parsed, err := url.Parse(URL); err == nil && parsed.Host != "" {
		host = parsed.Hostname()
	} else {
		// No authority component was recognised: treat the input as
		// "host[:port][/path]" and strip everything after the host.
		if end := strings.IndexAny(host, "/?#"); end >= 0 {
			host = host[:end]
		}
		host, _, _ = strings.Cut(host, ":")
	}

	// Host names are case-insensitive and may carry a trailing root dot.
	host = strings.ToLower(strings.TrimSuffix(host, "."))

	return host == "cloudflarestream.com" || strings.HasSuffix(host, ".cloudflarestream.com")
}
65+
6266
func GetJSFiles(doc *goquery.Document, watchPageURL *url.URL, httpClient warc.CustomHTTPClient) (archivedURLs []string, err error) {
6367
var latestJSURL string
6468

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
package ina
2+
3+
import (
4+
"encoding/json"
5+
"io"
6+
"net/http"
7+
"net/url"
8+
"strings"
9+
"time"
10+
11+
"github.com/PuerkitoBio/goquery"
12+
"github.com/internetarchive/Zeno/internal/pkg/utils"
13+
)
14+
15+
// APIResponse is the JSON document that ExtractMedias decodes from an INA
// API response body. Field names follow the keys given in the json tags.
type APIResponse struct {
	ID              string    `json:"id"`
	Title           string    `json:"title"`
	Description     string    `json:"description"`
	DateOfBroadcast time.Time `json:"dateOfBroadcast"`
	Type            string    `json:"type"`
	Duration        int       `json:"duration"`
	Categories      []any     `json:"categories"`

	// Credits carries JSON-LD style entries ("@context", "@type", "@id").
	Credits []struct {
		Context struct {
			Vocab      string `json:"@vocab"`
			Hydra      string `json:"hydra"`
			Name       string `json:"name"`
			Value      string `json:"value"`
			Attributes string `json:"attributes"`
		} `json:"@context"`
		Type       string `json:"@type"`
		ID         string `json:"@id"`
		Name       string `json:"name"`
		Value      string `json:"value"`
		Attributes []struct {
			Context struct {
				Vocab string `json:"@vocab"`
				Hydra string `json:"hydra"`
				Key   string `json:"key"`
				Value string `json:"value"`
			} `json:"@context"`
			Type  string `json:"@type"`
			ID    string `json:"@id"`
			Key   string `json:"key"`
			Value string `json:"value"`
		} `json:"attributes"`
	} `json:"credits"`

	Restrictions []any `json:"restrictions"`

	// ResourceURL and ResourceThumbnail are collected as assets by ExtractMedias.
	ResourceURL                  string `json:"resourceUrl"`
	ResourceThumbnail            string `json:"resourceThumbnail"`
	RestrictedBroadcastCountries []any  `json:"restrictedBroadcastCountries"`

	// EmbedURL is appended to "https://player.ina.fr" by ExtractMedias.
	EmbedURL        string `json:"embedUrl"`
	AllowEmbed      bool   `json:"allowEmbed"`
	Ratio           string `json:"ratio"`
	CollectionTitle string `json:"collectionTitle"`
	IsOnline        bool   `json:"isOnline"`
	AllowAds        bool   `json:"allowAds"`
	TypeMedia       string `json:"typeMedia"`
	HideLogo        bool   `json:"hideLogo"`

	// URI is collected as an asset by ExtractMedias.
	URI              string `json:"uri"`
	AdvertisingAsset bool   `json:"advertisingAsset"`
}
63+
64+
// IsURL reports whether req targets ina.fr or one of its subdomains.
//
// The decision is made on the request host only. A plain substring search on
// the whole URL would also accept unrelated hosts such as "china.fr", or any
// URL that merely mentions ina.fr in its path or query string.
// A nil request, or a request without a URL, never matches.
func IsURL(req *http.Request) bool {
	if req == nil || req.URL == nil {
		return false
	}

	// Hostname strips the port; host names compare case-insensitively.
	host := strings.ToLower(req.URL.Hostname())

	return host == "ina.fr" || strings.HasSuffix(host, ".ina.fr")
}
67+
68+
// IsAPIURL reports whether req targets the apipartner.ina.fr API, excluding
// URLs that contain "playerConfigurations.json".
//
// The host is compared exactly (case-insensitively), so a URL that merely
// mentions apipartner.ina.fr in its path or query string is not treated as an
// API call. A nil request, or a request without a URL, never matches.
func IsAPIURL(req *http.Request) bool {
	if req == nil || req.URL == nil {
		return false
	}

	if !strings.EqualFold(req.URL.Hostname(), "apipartner.ina.fr") {
		return false
	}

	return !strings.Contains(req.URL.String(), "playerConfigurations.json")
}
71+
72+
func ExtractPlayerURLs(doc *goquery.Document) []*url.URL {
73+
var assets []string
74+
75+
doc.Find("div[data-type=player]").Each(func(i int, s *goquery.Selection) {
76+
if playerConfigURL, exists := s.Attr("config-url"); exists {
77+
assets = append(assets, playerConfigURL)
78+
}
79+
80+
if assetDetailsURL, exists := s.Attr("asset-details-url"); exists {
81+
assets = append(assets, assetDetailsURL)
82+
}
83+
84+
if posterURL, exists := s.Attr("poster"); exists {
85+
assets = append(assets, posterURL)
86+
}
87+
})
88+
89+
return utils.StringSliceToURLSlice(assets)
90+
}
91+
92+
func ExtractMedias(resp *http.Response) ([]*url.URL, error) {
93+
var assets []string
94+
95+
body, err := io.ReadAll(resp.Body)
96+
if err != nil {
97+
return nil, err
98+
}
99+
100+
var data APIResponse
101+
err = json.Unmarshal(body, &data)
102+
if err != nil {
103+
return nil, err
104+
}
105+
106+
assets = append(assets, data.ResourceURL, data.ResourceThumbnail, "https://player.ina.fr"+data.EmbedURL, data.URI)
107+
108+
return utils.StringSliceToURLSlice(assets), nil
109+
}

internal/pkg/crawl/sitespecific/reddit/post.go

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package reddit
33
import (
44
"encoding/json"
55
"fmt"
6+
"io"
67
"net/http"
78
"net/url"
89
"strings"
@@ -184,11 +185,16 @@ type Post struct {
184185
} `json:"data"`
185186
}
186187

187-
func IsRedditPostAPI(req *http.Request) bool {
188+
func IsPostAPI(req *http.Request) bool {
188189
return strings.Contains(utils.URLToString(req.URL), "reddit.com/api/info.json?id=t3_")
189190
}
190191

191-
func ExtractPost(body []byte) (permalinks []string, assets []string, err error) {
192+
func ExtractPost(resp *http.Response) (permalinks []string, assets []string, err error) {
193+
body, err := io.ReadAll(resp.Body)
194+
if err != nil {
195+
return permalinks, assets, err
196+
}
197+
192198
var data Post
193199
err = json.Unmarshal(body, &data)
194200
if err != nil {

internal/pkg/crawl/sitespecific/reddit/reddit.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ import (
55
"strings"
66
)
77

8-
func IsRedditURL(URL string) bool {
8+
// IsURL reports whether URL contains the substring "reddit.com".
func IsURL(URL string) bool {
	const domain = "reddit.com"

	return strings.Contains(URL, domain)
}
1111

internal/pkg/upload/upload.go

Lines changed: 0 additions & 5 deletions
This file was deleted.

0 commit comments

Comments
 (0)