Skip to content

Commit 2c56094

Browse files
committed
Merge branch 'release/v0.3.0-alpha'
2 parents 7f4882e + a1625fa commit 2c56094

30 files changed

+223
-858
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
/.idea

CHANGELOG.md

+5
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,11 @@ All notable changes to this project will be documented in this file.
44
The format is based on [Keep a Changelog](http://keepachangelog.com/)
55
and this project adheres to [Semantic Versioning](http://semver.org/).
66

7+
## [Unreleased]
8+
9+
### Added
10+
- Add "Save downloaded data to disk" to the roadmap (feature request #1)
11+
712
## [v0.2.0-alpha] - 2017-02-07
813

914
### Fixed

README.md

+10-2
Original file line numberDiff line numberDiff line change
@@ -25,12 +25,20 @@ gargantua crawl --url https://www.sitemaps.org/sitemap.xml --workers 5
2525

2626
see also: [A short introduction video of gargantua on YouTube](https://www.youtube.com/watch?v=TSCMvUvc0qo)
2727

28+
### Customize the user-agent
29+
30+
You can specify a customized user agent using the `--user-agent` argument:
31+
32+
```bash
33+
gargantua crawl --url https://www.sitemaps.org/sitemap.xml --workers 5 --user-agent "gargantua bot / iPhone"
34+
```
35+
2836
## Download
2937

3038
You can download binaries for Linux, macOS and Windows from [github.com » andreaskoch » gargantua » releases](https://github.com/andreaskoch/gargantua/releases):
3139

3240
```bash
33-
wget https://github.com/andreaskoch/gargantua/releases/download/v0.2.0-alpha/gargantua_linux_amd64
41+
wget https://github.com/andreaskoch/gargantua/releases/download/v0.3.0-alpha/gargantua_linux_amd64
3442
```
3543

3644
## Docker Image
@@ -52,10 +60,10 @@ docker run --rm andreaskoch/gargantua:latest \
5260
## Roadmap
5361

5462
- Increase the number of workers at runtime
55-
- Personalized user agent string
5663
- Silent mode (only show statistics at the end)
5764
- CSV mode (print CSV output to stdout)
5865
- Web-UI
66+
- Save downloaded data to disk
5967

6068
## License
6169

crawler.go

+5-4
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,13 @@ import (
88
type CrawlOptions struct {
99
NumberOfConcurrentRequests int
1010
Timeout time.Duration
11+
UserAgent string
1112
}
1213

1314
func crawl(xmlSitemapURL url.URL, options CrawlOptions, stop chan bool) error {
1415

1516
// read the XML sitemap as an initial source for URLs
16-
urlsFromXMLSitemap, err := getURLs(xmlSitemapURL)
17+
urlsFromXMLSitemap, err := getURLs(xmlSitemapURL, "gargantua bot")
1718
if err != nil {
1819
return err
1920
}
@@ -59,7 +60,7 @@ func crawl(xmlSitemapURL url.URL, options CrawlOptions, stop chan bool) error {
5960
go func() {
6061
workerID := <-workers
6162
debugf("Using worker %d for URL %q", workerID, targetURL.String())
62-
results <- executeWork(workerID, cap(workers), targetURL, urls)
63+
results <- executeWork(workerID, cap(workers), targetURL, options.UserAgent, urls)
6364
debugf("Worker %d finished processing URL %q", workerID, targetURL.String())
6465
workers <- workerID
6566
}()
@@ -85,8 +86,8 @@ func crawl(xmlSitemapURL url.URL, options CrawlOptions, stop chan bool) error {
8586
return
8687

8788
case result := <-results:
88-
url := result.URL()
89-
debugf("Received results for URL %q", url.String())
89+
receivedUrl := result.URL()
90+
debugf("Received results for URL %q", receivedUrl.String())
9091
updateStatistics(result)
9192
}
9293
}

go.mod

+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
module github.com/andreaskoch/gargantua
2+
3+
go 1.14
4+
5+
require (
6+
github.com/PuerkitoBio/goquery v1.0.2
7+
github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc // indirect
8+
github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf // indirect
9+
github.com/andybalholm/cascadia v0.0.0-20161224141413-349dd0209470 // indirect
10+
github.com/gizak/termui v2.2.1-0.20170117222342-991cd3d38091+incompatible
11+
github.com/maruel/panicparse v0.0.0-20160720141634-ad661195ed0e // indirect
12+
github.com/mattn/go-runewidth v0.0.2-0.20161012013512-737072b4e32b // indirect
13+
github.com/mitchellh/go-wordwrap v0.0.0-20150314170334-ad45545899c7 // indirect
14+
github.com/nsf/termbox-go v0.0.0-20161205194251-abe82ce5fb7a // indirect
15+
golang.org/x/net v0.0.0-20161101191631-4bb47a1098b3 // indirect
16+
gopkg.in/alecthomas/kingpin.v2 v2.2.3
17+
)

go.sum

+22
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
github.com/PuerkitoBio/goquery v1.0.2 h1:6eVgli+CgrpInQgyW5Unj3aqfzqFk/ALcKm6m0w7hgA=
2+
github.com/PuerkitoBio/goquery v1.0.2/go.mod h1:T9ezsOHcCrDCgA8aF1Cqr3sSYbO/xgdy8/R/XiIMAhA=
3+
github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc h1:cAKDfWh5VpdgMhJosfJnn5/FoN2SRZ4p7fJNX58YPaU=
4+
github.com/alecthomas/template v0.0.0-20160405071501-a0175ee3bccc/go.mod h1:LOuyumcjzFXgccqObfd/Ljyb9UuFJ6TxHnclSeseNhc=
5+
github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf h1:qet1QNfXsQxTZqLG4oE62mJzwPIB8+Tee4RNCL9ulrY=
6+
github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0=
7+
github.com/andybalholm/cascadia v0.0.0-20161224141413-349dd0209470 h1:4jHLmof+Hba81591gfH5xYA8QXzuvgksxwPNrmjR2BA=
8+
github.com/andybalholm/cascadia v0.0.0-20161224141413-349dd0209470/go.mod h1:3I+3V7B6gTBYfdpYgIG2ymALS9H+5VDKUl3lHH7ToM4=
9+
github.com/gizak/termui v2.2.1-0.20170117222342-991cd3d38091+incompatible h1:opetNB+OO9qymCnrSBGZPPKuQMMYBcyrzEYiOB+RrHM=
10+
github.com/gizak/termui v2.2.1-0.20170117222342-991cd3d38091+incompatible/go.mod h1:PkJoWUt/zacQKysNfQtcw1RW+eK2SxkieVBtl+4ovLA=
11+
github.com/maruel/panicparse v0.0.0-20160720141634-ad661195ed0e h1:e2z/lz9pvtRrEOgKWaLW2Dw02Nqd3/fqv0qWTQ8ByZE=
12+
github.com/maruel/panicparse v0.0.0-20160720141634-ad661195ed0e/go.mod h1:nty42YY5QByNC5MM7q/nj938VbgPU7avs45z6NClpxI=
13+
github.com/mattn/go-runewidth v0.0.2-0.20161012013512-737072b4e32b h1:zGKCShADxSzhD4RVcNFKuaedhqMYyWD54Fg9aV/BvXM=
14+
github.com/mattn/go-runewidth v0.0.2-0.20161012013512-737072b4e32b/go.mod h1:LwmH8dsx7+W8Uxz3IHJYH5QSwggIsqBzpuz5H//U1FU=
15+
github.com/mitchellh/go-wordwrap v0.0.0-20150314170334-ad45545899c7 h1:DpOJ2HYzCv8LZP15IdmG+YdwD2luVPHITV96TkirNBM=
16+
github.com/mitchellh/go-wordwrap v0.0.0-20150314170334-ad45545899c7/go.mod h1:ZXFpozHsX6DPmq2I0TCekCxypsnAUbP2oI0UX1GXzOo=
17+
github.com/nsf/termbox-go v0.0.0-20161205194251-abe82ce5fb7a h1:JbDkPy70t0IWlnEvNb5TsmOOpKp/0UkPk2FMek2mOGM=
18+
github.com/nsf/termbox-go v0.0.0-20161205194251-abe82ce5fb7a/go.mod h1:IuKpRQcYE1Tfu+oAQqaLisqDeXgjyyltCfsaoYN18NQ=
19+
golang.org/x/net v0.0.0-20161101191631-4bb47a1098b3 h1:9FrZULpPblLeSMxFmRapLbJGYHjcvaCZYD+5rwKQqZA=
20+
golang.org/x/net v0.0.0-20161101191631-4bb47a1098b3/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4=
21+
gopkg.in/alecthomas/kingpin.v2 v2.2.3 h1:/L3oK40poPRwke0Ipa6qqf8n+awu60Vl3DMe+3jLDt4=
22+
gopkg.in/alecthomas/kingpin.v2 v2.2.3/go.mod h1:FMv+mEhP44yOT+4EoQTLFTRgOQ1FBLkstjWtayDeSgw=

http.go

+19-10
Original file line numberDiff line numberDiff line change
@@ -48,9 +48,18 @@ func (response *Response) IsHTML() bool {
4848
return strings.HasPrefix(response.contentType, "text/html")
4949
}
5050

51-
func readURL(url url.URL) (Response, error) {
51+
func readURL(url url.URL, userAgent string) (Response, error) {
5252
startTime := time.Now().UTC()
53-
resp, fetchErr := http.Get(url.String())
53+
54+
req, requestErr := http.NewRequest("GET", url.String(), nil)
55+
if requestErr != nil {
56+
return Response{}, requestErr
57+
}
58+
59+
req.Header.Set("User-Agent", userAgent)
60+
61+
client := &http.Client{}
62+
resp, fetchErr := client.Do(req)
5463
if fetchErr != nil {
5564
return Response{}, fetchErr
5665
}
@@ -78,16 +87,16 @@ func readURL(url url.URL) (Response, error) {
7887
}, nil
7988
}
8089

81-
func getURLs(xmlSitemapURL url.URL) ([]url.URL, error) {
90+
func getURLs(xmlSitemapURL url.URL, userAgent string) ([]url.URL, error) {
8291

8392
var urls []url.URL
8493

85-
urlsFromIndex, indexError := getURLsFromSitemapIndex(xmlSitemapURL)
94+
urlsFromIndex, indexError := getURLsFromSitemapIndex(xmlSitemapURL, userAgent)
8695
if indexError == nil {
8796
urls = urlsFromIndex
8897
}
8998

90-
urlsFromSitemap, sitemapError := getURLsFromSitemap(xmlSitemapURL)
99+
urlsFromSitemap, sitemapError := getURLsFromSitemap(xmlSitemapURL, userAgent)
91100
if sitemapError == nil {
92101
urls = append(urls, urlsFromSitemap...)
93102
}
@@ -100,11 +109,11 @@ func getURLs(xmlSitemapURL url.URL) ([]url.URL, error) {
100109

101110
}
102111

103-
func getURLsFromSitemap(xmlSitemapURL url.URL) ([]url.URL, error) {
112+
func getURLsFromSitemap(xmlSitemapURL url.URL, userAgent string) ([]url.URL, error) {
104113

105114
var urls []url.URL
106115

107-
sitemap, xmlSitemapError := getXMLSitemap(xmlSitemapURL)
116+
sitemap, xmlSitemapError := getXMLSitemap(xmlSitemapURL, userAgent)
108117
if xmlSitemapError != nil {
109118
return nil, xmlSitemapError
110119
}
@@ -122,11 +131,11 @@ func getURLsFromSitemap(xmlSitemapURL url.URL) ([]url.URL, error) {
122131
return urls, nil
123132
}
124133

125-
func getURLsFromSitemapIndex(xmlSitemapURL url.URL) ([]url.URL, error) {
134+
func getURLsFromSitemapIndex(xmlSitemapURL url.URL, userAgent string) ([]url.URL, error) {
126135

127136
var urls []url.URL
128137

129-
sitemapIndex, sitemapIndexError := getSitemapIndex(xmlSitemapURL)
138+
sitemapIndex, sitemapIndexError := getSitemapIndex(xmlSitemapURL, userAgent)
130139
if sitemapIndexError != nil {
131140
return nil, sitemapIndexError
132141
}
@@ -138,7 +147,7 @@ func getURLsFromSitemapIndex(xmlSitemapURL url.URL) ([]url.URL, error) {
138147
return nil, err
139148
}
140149

141-
sitemapUrls, err := getURLsFromSitemap(*locationURL)
150+
sitemapUrls, err := getURLsFromSitemap(*locationURL, userAgent)
142151
if err != nil {
143152
return nil, err
144153
}

main.go

+7-3
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,9 @@ import (
1111
)
1212

1313
const applicationName = "gargantua"
14-
const applicationVersion = "v0.2.0-alpha"
14+
const applicationVersion = "v0.3.0-alpha"
15+
16+
var defaultUserAgent = fmt.Sprintf("%s bot (https://github.com/andreaskoch/gargantua)", applicationName)
1517

1618
var (
1719
app = kingpin.New(applicationName, fmt.Sprintf(`「 %s 」%s crawls all URLs of your website - starting with the links in your sitemap.xml
@@ -27,6 +29,7 @@ var (
2729
crawlCommand = app.Command("crawl", "Crawls a given websites' XML sitemap")
2830
crawlWebsiteURL = crawlCommand.Flag("url", "The URL to a websites' XML sitemap (e.g. https://www.sitemaps.org/sitemap.xml)").Required().Envar("GARGANTUA_URL").Short('u').String()
2931
crawlWorkers = crawlCommand.Flag("workers", "The number of concurrent workers that crawl the site at the same time").Required().Envar("GARGANTUA_WORKERS").Short('w').Int()
32+
userAgent = crawlCommand.Flag("user-agent", "The user agent that shall be used for all requests").Default(defaultUserAgent).Envar("GARGANTUA_USER_AGENT").Short('a').String()
3033
)
3134

3235
func init() {
@@ -49,7 +52,7 @@ func handleCommandlineArgument(arguments []string) {
4952
os.Exit(1)
5053
}
5154

52-
err := startCrawling(*websiteURL, *crawlWorkers, *timeout, *verbose)
55+
err := startCrawling(*websiteURL, *userAgent, *crawlWorkers, *timeout, *verbose)
5356
if err != nil {
5457
fmt.Fprintf(os.Stderr, "%s", err)
5558
os.Exit(1)
@@ -60,7 +63,7 @@ func handleCommandlineArgument(arguments []string) {
6063

6164
}
6265

63-
func startCrawling(targetURL url.URL, concurrentRequests, timeoutInSeconds int, debugModeIsEnabled bool) error {
66+
func startCrawling(targetURL url.URL, userAgent string, concurrentRequests, timeoutInSeconds int, debugModeIsEnabled bool) error {
6467
stopTheCrawler := make(chan bool)
6568
stopTheUI := make(chan bool)
6669
crawlResult := make(chan error)
@@ -69,6 +72,7 @@ func startCrawling(targetURL url.URL, concurrentRequests, timeoutInSeconds int,
6972
result := crawl(targetURL, CrawlOptions{
7073
NumberOfConcurrentRequests: int(concurrentRequests),
7174
Timeout: time.Second * time.Duration(timeoutInSeconds),
75+
UserAgent: userAgent,
7276
}, stopTheCrawler)
7377

7478
stopTheUI <- true

sitemapindex.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@ import (
66
"strings"
77
)
88

9-
func getSitemapIndex(xmlSitemapURL url.URL) (SitemapIndex, error) {
10-
response, readErr := readURL(xmlSitemapURL)
9+
func getSitemapIndex(xmlSitemapURL url.URL, userAgent string) (SitemapIndex, error) {
10+
response, readErr := readURL(xmlSitemapURL, userAgent)
1111
if readErr != nil {
1212
return SitemapIndex{}, readErr
1313
}

sitemapindex_test.go

+2-2
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ func Test_getSitemapIndex_NoIndexGiven_ErrorIsReturned(t *testing.T) {
2020
defer testSitemapServer.Close()
2121

2222
testServerURL, _ := url.Parse(testSitemapServer.URL)
23-
_, err := getSitemapIndex(*testServerURL)
23+
_, err := getSitemapIndex(*testServerURL, "gargantua bot")
2424

2525
if err == nil {
2626
t.Fail()
@@ -47,7 +47,7 @@ func Test_getSitemapIndex_IndexExists_IndexIsNotEmpty(t *testing.T) {
4747
defer testSitemapServer.Close()
4848

4949
testServerURL, _ := url.Parse(testSitemapServer.URL)
50-
sitemapIndex, err := getSitemapIndex(*testServerURL)
50+
sitemapIndex, err := getSitemapIndex(*testServerURL, "gargantua bot")
5151

5252
if err != nil {
5353
t.Fail()

vendor/github.com/PuerkitoBio/goquery/.gitattributes

+1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vendor/github.com/PuerkitoBio/goquery/.gitignore

+16
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vendor/github.com/PuerkitoBio/goquery/.travis.yml

+11
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vendor/github.com/andybalholm/cascadia/.travis.yml

+14
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vendor/github.com/andybalholm/cascadia/LICENSE

100755100644
File mode changed.

vendor/github.com/gizak/termui/.gitignore

+26
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

vendor/github.com/gizak/termui/.travis.yml

+6
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)