Skip to content

Commit a3e9041

Browse files
committed
Merge branch 'release/v0.4.0-alpha'
2 parents 2c56094 + 183bf6b commit a3e9041

File tree

155 files changed

+818
-30565
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

155 files changed

+818
-30565
lines changed

.github/workflows/go.yml

+26
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
name: Go
2+
3+
on: [push]
4+
5+
jobs:
6+
7+
build:
8+
name: Build
9+
runs-on: ubuntu-latest
10+
steps:
11+
12+
- name: Set up Go 1.x
13+
uses: actions/setup-go@v2
14+
with:
15+
go-version: ^1
16+
17+
- name: Check out code into the Go module directory
18+
uses: actions/checkout@v2
19+
20+
- name: Build
21+
run: |
22+
go mod tidy
23+
go build -v .
24+
25+
- name: Test
26+
run: go test -v .

.gitignore

+2
Original file line numberDiff line numberDiff line change
@@ -1 +1,3 @@
1+
.DS_Store
12
/.idea
3+
/vendor

CHANGELOG.md

+10-1
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,19 @@ All notable changes to this project will be documented in this file.
44
The format is based on [Keep a Changelog](http://keepachangelog.com/)
55
and this project adheres to [Semantic Versioning](http://semver.org/).
66

7-
## [Unreleased]
7+
## [v0.4.0-alpha] - 2020-11-05
8+
9+
Logging
810

911
### Added
1012
- Add "Save downloaded data to disk" to the roadmap (feature request #1)
13+
- Log results to a log file
14+
- Add GitHub Actions
15+
16+
### Changed
17+
- Add support for non-sitemap URLs
18+
- Capture the parent URL
19+
- Switch from go 1.14 to 1.15
1120

1221
## [v0.2.0-alpha] - 2017-02-07
1322

README.md

+24
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,30 @@ You can specify a customized user agent using the `--user-agent` argument:
3333
gargantua crawl --url https://www.sitemaps.org/sitemap.xml --workers 5 --user-agent "gargantua bot / iPhone"
3434
```
3535

36+
### Log all requests
37+
38+
You can specify a log file with the `--log` argument:
39+
40+
```bash
41+
gargantua crawl --url https://www.sitemaps.org/sitemap.xml --workers 5 --log "gargantua.log"
42+
```
43+
44+
```
45+
Date and time #worker Status Code Bytes Response Time URL Parent URL
46+
2020/11/05 09:23:14 #001: 200 4403 148.759000ms https://www.sitemaps.org https://www.sitemaps.org/ko/faq.html
47+
2020/11/05 09:23:14 #002: 200 4403 290.536000ms http://www.sitemaps.org/ https://www.sitemaps.org/ko/faq.html
48+
2020/11/05 09:23:14 #003: 200 45077 283.243000ms https://www.sitemaps.org/protocol.html https://www.sitemaps.org/ko/faq.html
49+
2020/11/05 09:23:14 #004: 404 1245 155.376000ms https://www.sitemaps.org/protocol.htm https://www.sitemaps.org/ko/faq.html
50+
2020/11/05 09:23:14 #005: 200 4403 155.577000ms https://www.sitemaps.org/index.html https://www.sitemaps.org/ko/faq.html
51+
2020/11/05 09:23:14 #001: 200 2591 286.451000ms http://www.sitemaps.org/schemas/sitemap/0.9/siteindex.xsd https://www.sitemaps.org/ko/faq.html
52+
2020/11/05 09:23:14 #003: 200 10839 143.738000ms https://www.sitemaps.org/terms.html https://www.sitemaps.org/ko/faq.html
53+
2020/11/05 09:23:14 #005: 200 15681 141.580000ms https://www.sitemaps.org/faq.html https://www.sitemaps.org/ko/protocol.html
54+
2020/11/05 09:23:14 #002: 404 1245 286.175000ms http://www.sitemaps.org/protocol.htm https://www.sitemaps.org/ko/faq.html
55+
```
56+
57+
[gargantua.log](files/gargantua.log)
58+
59+
3660
## Download
3761

3862
You can download binaries for Linux, macOS and Windows from [github.com » andreaskoch » gargantua » releases](https://github.com/andreaskoch/gargantua/releases):

crawler.go

+21-2
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,18 @@
11
package main
22

33
import (
4+
"github.com/pkg/errors"
5+
"log"
46
"net/url"
7+
"os"
58
"time"
69
)
710

811
type CrawlOptions struct {
912
NumberOfConcurrentRequests int
1013
Timeout time.Duration
1114
UserAgent string
15+
LogFile string
1216
}
1317

1418
func crawl(xmlSitemapURL url.URL, options CrawlOptions, stop chan bool) error {
@@ -20,7 +24,7 @@ func crawl(xmlSitemapURL url.URL, options CrawlOptions, stop chan bool) error {
2024
}
2125

2226
// the URL queue
23-
urls := make(chan url.URL, len(urlsFromXMLSitemap))
27+
urls := make(chan crawlerUrl, len(urlsFromXMLSitemap))
2428

2529
// fill the URL queue with the URLs from the XML sitemap
2630
for _, xmlSitemapURLEntry := range urlsFromXMLSitemap {
@@ -37,7 +41,7 @@ func crawl(xmlSitemapURL url.URL, options CrawlOptions, stop chan bool) error {
3741

3842
allURLsHaveBeenVisited := make(chan bool)
3943
go func() {
40-
var visitedURLs = make(map[string]url.URL)
44+
var visitedURLs = make(map[string]crawlerUrl)
4145
for {
4246
select {
4347
case <-stop:
@@ -76,6 +80,17 @@ func crawl(xmlSitemapURL url.URL, options CrawlOptions, stop chan bool) error {
7680
}
7781
}()
7882

83+
var logger *log.Logger
84+
if options.LogFile != "" {
85+
file, err := os.OpenFile(options.LogFile, os.O_APPEND|os.O_CREATE|os.O_WRONLY, 0666)
86+
if err != nil {
87+
return errors.Wrapf(err, "failed to open log file %q for writing", options.LogFile)
88+
}
89+
90+
defer file.Close()
91+
logger = log.New(file, "", log.Ldate|log.Ltime)
92+
}
93+
7994
// update the statistics with the results
8095
allStatisticsHaveBeenUpdated := make(chan bool)
8196
go func() {
@@ -89,6 +104,10 @@ func crawl(xmlSitemapURL url.URL, options CrawlOptions, stop chan bool) error {
89104
receivedUrl := result.URL()
90105
debugf("Received results for URL %q", receivedUrl.String())
91106
updateStatistics(result)
107+
108+
if logger != nil {
109+
logResult(logger, result)
110+
}
92111
}
93112
}
94113
}()

0 commit comments

Comments
 (0)