Skip to content

Commit 1fca2f0

Browse files
committed
add: JW player assets for ina.fr
1 parent bbfd8dc commit 1fca2f0

File tree

2 files changed

+88
-2
lines changed

2 files changed

+88
-2
lines changed

internal/pkg/crawl/capture.go

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -532,7 +532,7 @@ func (c *Crawl) Capture(item *queue.Item) error {
532532
})).Info("URL archived")
533533
}
534534
} else if ina.IsURL(req) {
535-
playerURLs := ina.ExtractPlayerURLs(doc)
535+
playerURLs := ina.ExtractPlayerURLs(doc, c.Client)
536536

537537
for _, playerURL := range playerURLs {
538538
playerItem, err := queue.NewItem(playerURL, item.URL, "seed", 0, "", false)

internal/pkg/crawl/sitespecific/ina/ina.go

Lines changed: 87 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5,13 +5,26 @@ import (
55
"io"
66
"net/http"
77
"net/url"
8+
"regexp"
89
"strings"
10+
"sync"
911
"time"
1012

13+
"github.com/CorentinB/warc"
1114
"github.com/PuerkitoBio/goquery"
1215
"github.com/internetarchive/Zeno/internal/pkg/utils"
1316
)
1417

18+
var (
	// playerVersion caches the ina.fr player version string once it has been
	// fetched; the empty string means it has not been recorded yet.
	playerVersion string

	// playerVersionLock guards playerVersion.
	playerVersionLock sync.Mutex

	// playerRegex matches double-quoted, protocol-relative URLs on
	// ssl.p.jwpcdn.com that end in ".js" (the surrounding quotes are part of
	// the match). It is compiled at declaration so it is never nil.
	playerRegex = regexp.MustCompile(`"//ssl\.p\.jwpcdn\.com[^"]+\.js"`)
)
27+
1528
type APIResponse struct {
1629
ID string `json:"id"`
1730
Title string `json:"title"`
@@ -69,7 +82,7 @@ func IsAPIURL(req *http.Request) bool {
6982
return strings.Contains(utils.URLToString(req.URL), "apipartner.ina.fr") && !strings.Contains(utils.URLToString(req.URL), "playerConfigurations.json")
7083
}
7184

72-
func ExtractPlayerURLs(doc *goquery.Document) []*url.URL {
85+
func ExtractPlayerURLs(doc *goquery.Document, c *warc.CustomHTTPClient) []*url.URL {
7386
var assets []string
7487

7588
doc.Find("div[data-type=player]").Each(func(i int, s *goquery.Selection) {
@@ -86,9 +99,82 @@ func ExtractPlayerURLs(doc *goquery.Document) []*url.URL {
8699
}
87100
})
88101

102+
assets = append(assets, getJWPlayerURLs(c)...)
103+
89104
return utils.StringSliceToURLSlice(assets)
90105
}
91106

107+
func getJWPlayerURLs(c *warc.CustomHTTPClient) (URLs []string) {
108+
playerVersionLock.Lock()
109+
defer playerVersionLock.Unlock()
110+
111+
if playerVersion == "" {
112+
resp, err := c.Get("https://player-hub.ina.fr/version")
113+
if err != nil {
114+
return URLs
115+
}
116+
defer resp.Body.Close()
117+
118+
if resp.StatusCode != http.StatusOK {
119+
return URLs
120+
}
121+
122+
body, err := io.ReadAll(resp.Body)
123+
if err != nil {
124+
return URLs
125+
}
126+
127+
playerVersion = string(body)
128+
129+
URLs = append(URLs,
130+
"https://player-hub.ina.fr/dist/ina-player.min.js?version="+playerVersion,
131+
"https://player-hub.ina.fr/dist/player-default-skin.min.css?version="+playerVersion,
132+
"https://player-hub.ina.fr/assets/player/svg/pause.svg",
133+
"https://player-hub.ina.fr/assets/player/svg/play.svg",
134+
"https://player-hub.ina.fr/assets/player/svg/backward.svg",
135+
"https://player-hub.ina.fr/assets/player/svg/forward.svg",
136+
)
137+
138+
// Get the JWPlayer JS code
139+
playerResp, err := c.Get("https://player-hub.ina.fr/js/jwplayer/jwplayer.js?version=" + playerVersion)
140+
if err != nil {
141+
return URLs
142+
}
143+
defer playerResp.Body.Close()
144+
145+
if playerResp.StatusCode != http.StatusOK {
146+
return URLs
147+
}
148+
149+
// Find the JWPlayer assets in the JS file
150+
body, err = io.ReadAll(playerResp.Body)
151+
if err != nil {
152+
return URLs
153+
}
154+
155+
matches := playerRegex.FindAllString(string(body), -1)
156+
157+
// Clean up the matches (remove quotes)
158+
for _, match := range matches {
159+
URLs = append(URLs, "https:"+match[1:len(match)-1])
160+
}
161+
162+
URLs = append(URLs, "https://ssl.p.jwpcdn.com/player/v/"+extractJWPlayerVersion(string(body))+"/jwplayer.core.controls.html5.js")
163+
}
164+
165+
return URLs
166+
}
167+
168+
// extractJWPlayerVersion scans body line by line for the text
// "JW Player version " and returns whatever follows it on that line, with
// surrounding whitespace (including a trailing "\r" from CRLF input) removed.
// It returns "" when no line contains the marker.
//
// The marker includes the trailing space, so a line that merely contains
// "JW Player version" with nothing after it is skipped instead of causing an
// out-of-range index.
func extractJWPlayerVersion(body string) string {
	const marker = "JW Player version "

	for _, line := range strings.Split(body, "\n") {
		idx := strings.Index(line, marker)
		if idx < 0 {
			continue
		}
		return strings.TrimSpace(line[idx+len(marker):])
	}

	return ""
}
177+
92178
func ExtractMedias(resp *http.Response) ([]*url.URL, error) {
93179
var assets []string
94180

0 commit comments

Comments
 (0)