Skip to content

Commit dbd3a75

Browse files
committed
Scraper: fixes #4 (http header fix)
1 parent df49c23 commit dbd3a75

File tree

1 file changed

+20
-0
lines changed

1 file changed

+20
-0
lines changed

mildew/dir.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,28 @@ import (
44
"context"
55
"encoding/json"
66
"fmt"
7+
"net/http"
78
"strings"
89

910
"github.com/gocolly/colly/v2"
1011
)
1112

13+
// defUserAgent is the User-Agent string sent with every scraper request. It
// presents the client as desktop Firefox 136 on Ubuntu.
const defUserAgent = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:136.0) Gecko/20100101 Firefox/136.0"

// head holds the default request headers handed to the colly collector in
// ScrapeDirs. It is built once during package initialization and should not
// be mutated afterwards, because a pointer to it is shared with the collector.
var head = defaultHeaders()

// defaultHeaders returns a fresh, browser-like set of request headers.
//
// Accept-Encoding is deliberately left unset. Setting it by hand turns off
// net/http's transparent gzip handling, and advertising "br" or "zstd" invites
// response bodies that the Go standard library cannot decode. Leaving it out
// lets the transport negotiate gzip on its own and decompress transparently.
func defaultHeaders() http.Header {
	h := make(http.Header, 8)
	// Set (rather than Add) guarantees exactly one value per header name.
	h.Set("User-Agent", defUserAgent)
	h.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
	h.Set("Accept-Language", "en-US,en;q=0.5")
	h.Set("Upgrade-Insecure-Requests", "1")
	h.Set("Sec-Fetch-Site", "none")
	h.Set("Sec-Fetch-User", "?1")
	h.Set("Priority", "u=0, i")
	h.Set("Te", "trailers")
	return h
}
28+
1229
// ScrapeDirs scrapes all DoD website directories and saves them to the Mildew object's Subs field.
1330
func (mw *Mildew) ScrapeDirs(ctx context.Context) error {
1431
dirStream := make(chan string)
@@ -18,6 +35,9 @@ func (mw *Mildew) ScrapeDirs(ctx context.Context) error {
1835
// Initialize base colly collector to be used by each directory scraper function
1936
// TODO tune colly options
2037
c := colly.NewCollector()
38+
c.UserAgent = defUserAgent
39+
c.Headers = &head
40+
2141
var err error
2242

2343
err = dirDod(c, dirStream)

0 commit comments

Comments
 (0)