Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 35 additions & 53 deletions internal/pkg/postprocessor/assets.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package postprocessor

import (
"fmt"
"net/url"
"slices"
"strings"
Expand All @@ -21,62 +22,44 @@ func ExtractAssetsOutlinks(item *models.Item) (assets, outlinks []*models.URL, e
return SanitizeAssetsOutlinks(item, assets, outlinks, err)
}

// Extract assets and outlinks from the body using the appropriate extractor
// Order is important, we want to check for more specific things first,
// as they may trigger more general extractors (e.g. HTML)
// TODO this should be refactored using interfaces
// AssetExtractor is the contract implemented by every extractor that can be
// registered for post-processing dispatch.
type AssetExtractor interface {
	// Match reports whether this extractor wants to handle the given URL.
	Match(u *models.URL) bool

	// Extract pulls assets and outlinks out of item. Either slice may be nil
	// when the extractor does not produce that kind of link.
	Extract(item *models.Item) (assets, outlinks []*models.URL, err error)
}

// assetExtractors is the ordered list of extractors tried for each item.
//
// Order matters: site-specific extractors are checked first, then
// general-purpose ones. The first match wins, so more specific
// extractors must precede broader ones (e.g. HTML).
var assetExtractors = []AssetExtractor{
ina.INAExtractor{},
truthsocial.TruthsocialExtractor{},
extractor.M3U8Extractor{},
extractor.JSONExtractor{},
extractor.XMLExtractor{},
// HTML is the broadest matcher listed here, so it is tried last.
extractor.HTMLAssetsExtractor{},
}

func Extractors(item *models.Item) (assets, outlinks []*models.URL, err error) {
logger := log.NewFieldedLogger(&log.Fields{
"component": "postprocessor.Extractors",
"item": item.GetShortID(),
})

switch {
case ina.IsAPIURL(item.GetURL()):
INAAssets, err := ina.ExtractMedias(item.GetURL())
if err != nil {
logger.Error("unable to extract medias from INA", "err", err.Error())
return assets, outlinks, err
}

HTMLAssets, err := extractor.HTMLAssets(item)
if err != nil {
logger.Error("unable to extract assets", "err", err.Error())
for _, ext := range assetExtractors {
// heavy debug log calls, can be omitted when merged
logger.Debug("AssetExtractor Match call", "url", item.GetURL())
if ext.Match(item.GetURL()) {
logger.Debug("matched extractor", "extractor", fmt.Sprintf("%T", ext))
assets, outlinks, err = ext.Extract(item)
logger.Debug("extraction result", "assets", len(assets), "outlinks", len(outlinks), "err", err)
if err != nil {
logger.Error("unable to extract assets", "err", err.Error())
}
return assets, outlinks, err
}
}

assets = append(INAAssets, HTMLAssets...)
case truthsocial.NeedExtraction(item.GetURL()):
assets, outlinks, err = truthsocial.ExtractAssets(item)
if err != nil {
logger.Error("unable to extract assets from TruthSocial", "err", err.Error())
return assets, outlinks, err
}
case extractor.IsM3U8(item.GetURL()):
assets, err = extractor.M3U8(item.GetURL())
if err != nil {
logger.Error("unable to extract assets", "err", err.Error())
return assets, outlinks, err
}
case extractor.IsJSON(item.GetURL()):
assets, outlinks, err = extractor.JSON(item.GetURL())
if err != nil {
logger.Error("unable to extract assets", "err", err.Error())
return assets, outlinks, err
}
case extractor.IsXML(item.GetURL()):
assets, outlinks, err = extractor.XML(item.GetURL())
if err != nil {
logger.Error("unable to extract assets", "err", err.Error())
return assets, outlinks, err
}
case extractor.IsHTML(item.GetURL()):
assets, err = extractor.HTMLAssets(item)
if err != nil {
logger.Error("unable to extract assets", "err", err.Error())
return assets, outlinks, err
}
case extractor.IsEmbeddedCSS(item):
// Embedded CSS is handled separately see PR discussion
if extractor.IsEmbeddedCSS(item) {
var atImportLinks []*models.URL
assets, atImportLinks, err = extractor.ExtractFromURLCSS(item.GetURL())

Expand All @@ -88,13 +71,12 @@ func Extractors(item *models.Item) (assets, outlinks []*models.URL, err error) {
logger.Debug("extracted assets from CSS", logArgs...)
}
extractor.AddAtImportLinksToItemChild(item, atImportLinks)
default:
contentType := item.GetURL().GetResponse().Header.Get("Content-Type")
logger.Debug("no extractor used for page", "content-type", contentType, "mime", item.GetURL().GetMIMEType().String())
return assets, outlinks, nil
return assets, outlinks, err
}

return assets, outlinks, err
contentType := item.GetURL().GetResponse().Header.Get("Content-Type")
logger.Debug("no extractor used for page", "content-type", contentType, "mime", item.GetURL().GetMIMEType().String())
return assets, outlinks, nil
}

func SanitizeAssetsOutlinks(item *models.Item, assets []*models.URL, outlinks []*models.URL, err error) ([]*models.URL, []*models.URL, error) {
Expand Down
11 changes: 11 additions & 0 deletions internal/pkg/postprocessor/extractor/html.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,17 @@ import (
"github.com/internetarchive/Zeno/pkg/models"
)

// HTMLAssetsExtractor exposes the package's HTML helpers (IsHTML and
// HTMLAssets) through a Match/Extract method pair.
type HTMLAssetsExtractor struct{}

// Match reports whether IsHTML accepts URL.
func (HTMLAssetsExtractor) Match(URL *models.URL) bool {
	return IsHTML(URL)
}

// Extract runs HTMLAssets on item and returns whatever assets and error it
// produced. This extractor never reports outlinks, so outlinks is always nil.
func (HTMLAssetsExtractor) Extract(item *models.Item) (assets, outlinks []*models.URL, err error) {
	found, extractErr := HTMLAssets(item)
	return found, nil, extractErr
}

var (
onclickRegex = regexp.MustCompile(`window\.location(?:\.href)?\s*=\s*['"]([^'"]+)['"]`)
)
Expand Down
18 changes: 12 additions & 6 deletions internal/pkg/postprocessor/extractor/html_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,8 @@ func TestHTMLAssetsAudioVideo(t *testing.T) {
</html>`
item := setupItem(html)

assets, err := HTMLAssets(item)
extractor := HTMLAssetsExtractor{}
assets, _, err := extractor.Extract(item)
if err != nil {
t.Errorf("HTMLAssets error = %v", err)
}
Expand All @@ -102,7 +103,8 @@ func TestHTMLAssetsAttributes(t *testing.T) {
</html>`
item := setupItem(html)

assets, err := HTMLAssets(item)
extractor := HTMLAssetsExtractor{}
assets, _, err := extractor.Extract(item)
if err != nil {
t.Errorf("HTMLAssets error = %v", err)
}
Expand Down Expand Up @@ -144,7 +146,8 @@ func TestHTMLAssetsMeta(t *testing.T) {
}
item := setupItem(html)

assets, err := HTMLAssets(item)
extractor := HTMLAssetsExtractor{}
assets, _, err := extractor.Extract(item)
if err != nil {
t.Errorf("HTMLAssets error = %v", err)
}
Expand All @@ -171,7 +174,8 @@ func TestSrcset(t *testing.T) {
</body>
</html>`
item := setupItem(html)
assets, err := HTMLAssets(item)
extractor := HTMLAssetsExtractor{}
assets, _, err := extractor.Extract(item)
if err != nil {
t.Errorf("Error extracting HTML assets %s", err)
}
Expand Down Expand Up @@ -225,7 +229,8 @@ func TestCSS(t *testing.T) {
</body>
</html>`
item := setupItem(html)
assets, err := HTMLAssets(item)
extractor := HTMLAssetsExtractor{}
assets, _, err := extractor.Extract(item)
if err != nil {
t.Errorf("Error extracting HTML assets %s", err)
}
Expand All @@ -247,7 +252,8 @@ func TestHTMLDataSrc(t *testing.T) {
</html>
`
item := setupItem(html)
assets, err := HTMLAssets(item)
extractor := HTMLAssetsExtractor{}
assets, _, err := extractor.Extract(item)
if err != nil {
t.Errorf("Error extracting HTML assets %s", err)
}
Expand Down
10 changes: 10 additions & 0 deletions internal/pkg/postprocessor/extractor/json.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,16 @@ import (
"github.com/internetarchive/Zeno/pkg/models"
)

// JSONExtractor exposes the package's JSON helpers (IsJSON and JSON) through
// a Match/Extract method pair.
type JSONExtractor struct{}

// Match reports whether IsJSON accepts URL.
func (JSONExtractor) Match(URL *models.URL) bool {
	return IsJSON(URL)
}

// Extract runs JSON on the item's URL and returns its assets, outlinks and
// error unchanged.
func (JSONExtractor) Extract(item *models.Item) (assets, outlinks []*models.URL, err error) {
	target := item.GetURL()
	assets, outlinks, err = JSON(target)
	return assets, outlinks, err
}

func IsJSON(URL *models.URL) bool {
return URL.GetMIMEType() != nil && strings.Contains(URL.GetMIMEType().String(), "json")
}
Expand Down
16 changes: 14 additions & 2 deletions internal/pkg/postprocessor/extractor/m3u8.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,21 @@ import (
"github.com/internetarchive/Zeno/pkg/models"
)

// M3U8Extractor exposes the package's M3U8 helpers (IsM3U8 and M3U8) through
// a Match/Extract method pair.
type M3U8Extractor struct{}

// Match reports whether IsM3U8 accepts URL.
func (M3U8Extractor) Match(URL *models.URL) bool {
	return IsM3U8(URL)
}

// Extract runs M3U8 on the item's URL and returns whatever assets and error
// it produced. This extractor never reports outlinks, so outlinks is always
// nil.
func (M3U8Extractor) Extract(item *models.Item) (assets, outlinks []*models.URL, err error) {
	target := item.GetURL()
	found, extractErr := M3U8(target)
	return found, nil, extractErr
}

// IsM3U8 reports whether URL's MIME type is one of the M3U8 playlist types
// "application/vnd.apple.mpegurl" or "application/x-mpegURL".
//
// It returns false when URL has no MIME type. The nil check is done once, up
// front, so that neither Is call can run on a nil MIME value (an unparenthesized
// `a != nil && b || c` would still evaluate c when a is nil).
func IsM3U8(URL *models.URL) bool {
	mt := URL.GetMIMEType()
	if mt == nil {
		return false
	}

	// TODO: https://github.com/gabriel-vasile/mimetype/pull/755 remove "application/x-mpegURL" when merged&released
	return mt.Is("application/vnd.apple.mpegurl") || mt.Is("application/x-mpegURL")
}

func M3U8(URL *models.URL) (assets []*models.URL, err error) {
Expand Down
48 changes: 48 additions & 0 deletions internal/pkg/postprocessor/extractor/m3u8_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
package extractor

import (
"net/http"
"testing"

"github.com/internetarchive/Zeno/pkg/models"
)

// TestShouldMatchM3U8URL builds a URL with a fake 200 response carrying a
// given Content-Type header and checks M3U8Extractor.Match against the
// expected outcome for that content type.
func TestShouldMatchM3U8URL(t *testing.T) {
	type matchCase struct {
		url      string
		mimeType string
		expected bool
	}

	tests := []matchCase{
		{url: "https://sub.example.com/test.m3u8", mimeType: "application/vnd.apple.mpegurl", expected: true},
		// will be fixed by PR https://github.com/gabriel-vasile/mimetype/pull/755
		{url: "https://sub.example.com/test2.m3u8", mimeType: "application/x-mpegURL", expected: true},
		{url: "https://sub.example.com/test3.m3u8", mimeType: "application/json", expected: false},
		{url: "https://sub.example.com/example.html", mimeType: "text/html", expected: false},
		{url: "https://sub.example.com/m3u8.txt", mimeType: "text/plain", expected: false},
		{url: "https://sub.example.com/example.mp4", mimeType: "application/octet-stream", expected: false},
		{url: "https://sub.example.com/example.form", mimeType: "application/x-www-form-urlencoded", expected: false},
	}

	for _, tc := range tests {
		t.Run(tc.url, func(t *testing.T) {
			target, err := models.NewURL(tc.url)
			if err != nil {
				t.Fatalf("failed to create URL: %v", err)
			}

			// Only the header matters here; the response carries no body.
			header := make(http.Header)
			header.Set("Content-Type", tc.mimeType)
			target.SetResponse(&http.Response{
				StatusCode: 200,
				Header:     header,
				Body:       nil,
			})

			var ext M3U8Extractor
			if got := ext.Match(&target); got != tc.expected {
				t.Errorf("M3U8Extractor.Match(%q) = %v, want %v: mimetype=%q", tc.url, got, tc.expected, target.GetMIMEType())
			}
		})
	}
}

// TODO: Add test for Extract()
11 changes: 10 additions & 1 deletion internal/pkg/postprocessor/extractor/xml.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,16 @@ import (
"github.com/internetarchive/Zeno/pkg/models"
)

// XMLExtractor exposes the package's XML helpers (IsXML and XML) through a
// Match/Extract method pair.
type XMLExtractor struct{}

// Match reports whether IsXML accepts URL.
func (XMLExtractor) Match(URL *models.URL) bool {
	return IsXML(URL)
}

// Extract runs XML on the item's URL and returns its assets, outlinks and
// error unchanged.
func (XMLExtractor) Extract(item *models.Item) (assets, outlinks []*models.URL, err error) {
	target := item.GetURL()
	assets, outlinks, err = XML(target)
	return assets, outlinks, err
}

// xmlBufioReaderPool pools bufio.Reader instances for XML parsing to reduce allocations when processing many XML documents.
var xmlBufioReaderPool = sync.Pool{
New: func() any {
Expand Down Expand Up @@ -110,7 +120,6 @@ func XML(URL *models.URL) (assets, outlinks []*models.URL, err error) {
body.Reset(URL.GetBody())
defer xmlBufioReaderPool.Put(body)


// Peek to check if body has any non-whitespace content
peek, err := body.Peek(512) // peek up to 512 bytes
if err != nil && err != io.EOF {
Expand Down
1 change: 0 additions & 1 deletion internal/pkg/postprocessor/item.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,6 @@ func postprocessItem(item *models.Item) []*models.Item {

if (item.GetURL().GetResponse() != nil && item.GetURL().GetResponse().StatusCode == 200) || // standard item
(item.GetURL().GetResponse() == nil && item.GetURL().GetBody() != nil) { // headless item
logger.Debug("item is a success")

var outlinksFromAssets []*models.URL

Expand Down
16 changes: 16 additions & 0 deletions internal/pkg/postprocessor/sitespecific/ina/ina.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,21 @@ import (
warc "github.com/internetarchive/gowarc"
)

// INAExtractor exposes the package's INA helpers (IsAPIURL and ExtractMedias)
// through a Match/Extract method pair.
type INAExtractor struct{}

// Match reports whether IsAPIURL accepts URL.
func (INAExtractor) Match(URL *models.URL) bool {
	return IsAPIURL(URL)
}

// Extract runs ExtractMedias on the item's URL. On failure it returns nil
// slices together with the error; on success it returns the medias as assets.
// This extractor never reports outlinks, so outlinks is always nil.
//
// NOTE(review): the switch-based dispatch this type replaces also appended
// HTMLAssets(item) results for INA API URLs — confirm that dropping them here
// is intentional.
func (INAExtractor) Extract(item *models.Item) (assets, outlinks []*models.URL, err error) {
	medias, extractErr := ExtractMedias(item.GetURL())
	if extractErr != nil {
		return nil, nil, extractErr
	}

	return medias, nil, nil
}

var (
playerVersion string
playerVersionLock sync.Mutex
Expand Down Expand Up @@ -176,6 +191,7 @@ func extractJWPlayerVersion(body string) string {
return ""
}

// TODO: Use the item obj directly instead of the URL
func ExtractMedias(URL *models.URL) (assets []*models.URL, err error) {
defer URL.RewindBody()

Expand Down
Loading
Loading