Skip to content

Commit 63f5310

Browse files
authored
Merge pull request #33 from planetlabs/filter
Optional filter for crawler
2 parents 8d5c7a1 + f3ca628 commit 63f5310

File tree

2 files changed

+88
-0
lines changed

2 files changed

+88
-0
lines changed

crawler/crawler.go

+13
Original file line number | Diff line number | Diff line change
@@ -104,6 +104,7 @@ type Crawler struct {
104104
visitor Visitor
105105
recursion RecursionType
106106
concurrency int
107+
filter func(string) bool
107108
queue workgroup.Queue[*Task]
108109
}
109110

@@ -116,6 +117,12 @@ type Options struct {
116117
// a single resource. Use Children to only visit linked item/child resources.
117118
Recursion RecursionType
118119

120+
// Optional function to limit which resources to crawl. If provided, the function
121+
// will be called with the URL or absolute path to a resource before it is crawled.
122+
// If the function returns false, the resource will not be read and the visitor will
123+
// not be called.
124+
Filter func(string) bool
125+
119126
Queue workgroup.Queue[*Task]
120127
}
121128

@@ -129,6 +136,9 @@ func (c *Crawler) apply(options *Options) {
129136
if options.Queue != nil {
130137
c.queue = options.Queue
131138
}
139+
if options.Filter != nil {
140+
c.filter = options.Filter
141+
}
132142
}
133143

134144
// DefaultOptions used when creating a new crawler.
@@ -211,6 +221,9 @@ func (c *Crawler) load(loc *normurl.Locator, value interface{}) error {
211221
}
212222

213223
func (c *Crawler) crawl(worker *workgroup.Worker[*Task], t *Task) error {
224+
if c.filter != nil && !c.filter(t.Url) {
225+
return nil
226+
}
214227
switch t.Type {
215228
case resourceTask:
216229
return c.crawlResource(worker, t.Url)

crawler/crawler_test.go

+75
Original file line number | Diff line number | Diff line change
@@ -35,6 +35,81 @@ func TestCrawler(t *testing.T) {
3535
assert.NoError(t, err)
3636

3737
assert.Equal(t, uint64(3), count)
38+
39+
wd, wdErr := os.Getwd()
40+
require.NoError(t, wdErr)
41+
42+
_, visitedCatalog := visited.Load(filepath.Join(wd, "testdata/v1.0.0/catalog-with-collection-of-items.json"))
43+
assert.True(t, visitedCatalog)
44+
45+
_, visitedCollection := visited.Load(filepath.Join(wd, "testdata/v1.0.0/collection-with-items.json"))
46+
assert.True(t, visitedCollection)
47+
48+
_, visitedItem := visited.Load(filepath.Join(wd, "testdata/v1.0.0/item-in-collection.json"))
49+
assert.True(t, visitedItem)
50+
}
51+
52+
// TestCrawlerFilterItem crawls the catalog fixture with a Filter that rejects
// any location ending in "/item-in-collection.json". It expects the visitor to
// be called exactly twice and checks that the catalog and the collection were
// both recorded as visited.
func TestCrawlerFilterItem(t *testing.T) {
53+
count := uint64(0)
54+
visited := &sync.Map{}
55+
56+
// The visitor counts every call and fails if the same location is seen twice.
visitor := func(location string, resource crawler.Resource) error {
57+
atomic.AddUint64(&count, 1)
58+
_, loaded := visited.LoadOrStore(location, true)
59+
if loaded {
60+
return fmt.Errorf("already visited %s", location)
61+
}
62+
return nil
63+
}
64+
c := crawler.New(visitor, &crawler.Options{
65+
// Returning false for the item location asks the crawler to skip it.
Filter: func(location string) bool {
66+
return !strings.HasSuffix(location, "/item-in-collection.json")
67+
},
68+
})
69+
70+
err := c.Crawl(context.Background(), "testdata/v1.0.0/catalog-with-collection-of-items.json")
71+
assert.NoError(t, err)
72+
73+
// Two visitor calls are expected; the lookups below confirm catalog and collection.
assert.Equal(t, uint64(2), count)
74+
75+
// Lookups below use absolute paths built from the working directory.
wd, wdErr := os.Getwd()
76+
require.NoError(t, wdErr)
77+
78+
_, visitedCatalog := visited.Load(filepath.Join(wd, "testdata/v1.0.0/catalog-with-collection-of-items.json"))
79+
assert.True(t, visitedCatalog)
80+
81+
_, visitedCollection := visited.Load(filepath.Join(wd, "testdata/v1.0.0/collection-with-items.json"))
82+
assert.True(t, visitedCollection)
83+
}
84+
85+
// TestCrawlerFilterCollection crawls the catalog fixture with a Filter that
// rejects any location ending in "/collection-with-items.json". It expects the
// visitor to be called exactly once and checks that the catalog was recorded
// as visited.
func TestCrawlerFilterCollection(t *testing.T) {
86+
count := uint64(0)
87+
visited := &sync.Map{}
88+
89+
// The visitor counts every call and fails if the same location is seen twice.
visitor := func(location string, resource crawler.Resource) error {
90+
atomic.AddUint64(&count, 1)
91+
_, loaded := visited.LoadOrStore(location, true)
92+
if loaded {
93+
return fmt.Errorf("already visited %s", location)
94+
}
95+
return nil
96+
}
97+
c := crawler.New(visitor, &crawler.Options{
98+
// Returning false for the collection location asks the crawler to skip it.
Filter: func(location string) bool {
99+
return !strings.HasSuffix(location, "/collection-with-items.json")
100+
},
101+
})
102+
103+
err := c.Crawl(context.Background(), "testdata/v1.0.0/catalog-with-collection-of-items.json")
104+
assert.NoError(t, err)
105+
106+
// A single visitor call is expected; the lookup below confirms it was the catalog.
assert.Equal(t, uint64(1), count)
107+
108+
// The lookup below uses an absolute path built from the working directory.
wd, wdErr := os.Getwd()
109+
require.NoError(t, wdErr)
110+
111+
_, visitedCatalog := visited.Load(filepath.Join(wd, "testdata/v1.0.0/catalog-with-collection-of-items.json"))
112+
assert.True(t, visitedCatalog)
38113
}
39114

40115
func TestCrawlerHTTP(t *testing.T) {

0 commit comments

Comments
 (0)