Skip to content

Commit c94b156

Browse files
tooryx and copybara-github
authored and committed
Silently ignore un-parsable links such as data: when crawling.
PiperOrigin-RevId: 871251237
1 parent 4ea086c commit c94b156

File tree

2 files changed

+24
-9
lines changed

2 files changed

+24
-9
lines changed

common/clients/httpcrawler/parser/parser.go

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,15 @@ import (
3333
var (
3434
// ErrParseURL is returned when the URL fails to parse.
3535
ErrParseURL = errors.New("failed to parse URL")
36-
// ErrUnsupportedURLType is returned when the URL is a javascript or mailto URL.
37-
ErrUnsupportedURLType = errors.New("unsupported javascript/mailto URL type")
3836
// ErrUnsupportedScheme is returned when the URL scheme is not supported.
3937
ErrUnsupportedScheme = errors.New("unsupported scheme")
4038

39+
unsupportedPrefixes = []string{
40+
"data:",
41+
"javascript:",
42+
"mailto:",
43+
}
44+
4145
knownLinkAttributes = []string{
4246
// HTML 4 link attributes.
4347
"action",
@@ -109,8 +113,10 @@ func processHTMLNode(rootURL string, node *html.Node) ([]string, error) {
109113
}
110114

111115
func parseURL(base string, redirect string) (string, error) {
112-
if strings.HasPrefix(redirect, "javascript:") || strings.HasPrefix(redirect, "mailto:") {
113-
return "", ErrUnsupportedURLType
116+
for _, prefix := range unsupportedPrefixes {
117+
if strings.HasPrefix(redirect, prefix) {
118+
return "", nil
119+
}
114120
}
115121

116122
redirurl, err := url.Parse(redirect)

common/clients/httpcrawler/parser/parser_test.go

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -229,16 +229,25 @@ func TestParseURL(t *testing.T) {
229229
wantErr: ErrUnsupportedScheme,
230230
},
231231
{
232-
name: "when_node_is_javascript_returns_error",
232+
name: "when_node_is_javascript_returns_nothing",
233233
rootURL: "http://domain.com/",
234-
nodeURL: "javascript:alert('Evil XSS')",
235-
wantErr: ErrUnsupportedURLType,
234+
nodeURL: "javascript:alert('hello')",
235+
wantErr: nil,
236+
want: "",
236237
},
237238
{
238-
name: "when_node_is_mailto_returns_error",
239+
name: "when_node_is_mailto_returns_nothing",
239240
rootURL: "http://domain.com/",
240241
nodeURL: "mailto:someone@domain.com",
241-
wantErr: ErrUnsupportedURLType,
242+
wantErr: nil,
243+
want: "",
244+
},
245+
{
246+
name: "when_node_is_data_url_returns_nothing",
247+
rootURL: "http://domain.com/",
248+
nodeURL: "data:text/plain;base64,SGVsbG8sIFdvcmxkIQ==",
249+
wantErr: nil,
250+
want: "",
242251
},
243252
}
244253

0 commit comments

Comments
 (0)