Skip to content

Commit 5f60913

Browse files
author
Massimiliano Giovagnoli
committed
new(cmd): set wildcard as default name for directories and files.
Moreover, add unit tests for shallow finds. Signed-off-by: Massimiliano Giovagnoli <me@maxgio.it>
1 parent e78e511 commit 5f60913

File tree

5 files changed

+277
-268
lines changed

5 files changed

+277
-268
lines changed

cmd/find/find.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ func NewCmd() *cobra.Command {
4545

4646
var filename string
4747

48-
cmd.Flags().StringVarP(&filename, "name", "n", "", "Base of file name (the path with the leading directories removed) exact pattern.")
48+
cmd.Flags().StringVarP(&filename, "name", "n", ".+", "Base of file name (the path with the leading directories removed) exact pattern.")
4949

5050
// As of now, only exact glob pattern expressions are allowed. The expression is then translated to an exact-match regular expression.
5151
o.FilenameRegexp = fmt.Sprintf("^%s$", filename)

pkg/find/file.go

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
package find
2+
3+
import (
4+
"fmt"
5+
"github.com/gocolly/colly"
6+
d "github.com/gocolly/colly/debug"
7+
"github.com/pkg/errors"
8+
"net/url"
9+
"path"
10+
"regexp"
11+
"strings"
12+
)
13+
14+
// crawlFiles returns a list of file names found from the seed URL, filtered by file name regex.
15+
//
16+
//nolint:funlen,cyclop
17+
func (o *Options) crawlFiles() (*Result, error) {
18+
seeds := []*url.URL{}
19+
20+
err := o.Validate()
21+
if err != nil {
22+
return nil, err
23+
}
24+
25+
for _, v := range o.SeedURLs {
26+
u, _ := url.Parse(v)
27+
28+
seeds = append(seeds, u)
29+
}
30+
31+
var files, urls []string
32+
33+
folderPattern := regexp.MustCompile(folderRegex)
34+
35+
exactFilePattern := regexp.MustCompile(o.FilenameRegexp)
36+
37+
fileRegex := strings.TrimPrefix(o.FilenameRegexp, "^")
38+
filePattern := regexp.MustCompile(fileRegex)
39+
40+
allowedDomains := getHostnamesFromURLs(seeds)
41+
42+
// Create the collector settings
43+
coOptions := []func(*colly.Collector){
44+
colly.AllowedDomains(allowedDomains...),
45+
colly.Async(false),
46+
}
47+
48+
if o.Verbose {
49+
coOptions = append(coOptions, colly.Debugger(&d.LogDebugger{}))
50+
}
51+
52+
// Create the collector.
53+
co := colly.NewCollector(coOptions...)
54+
55+
// Add the callback to Visit the linked resource, for each HTML element found
56+
co.OnHTML(HTMLTagLink, func(e *colly.HTMLElement) {
57+
href := e.Attr(HTMLAttrRef)
58+
59+
folderMatch := folderPattern.FindStringSubmatch(href)
60+
61+
u, _ := url.JoinPath(e.Request.URL.String(), href)
62+
63+
// If the URL is not of a folder.
64+
if len(folderMatch) == 0 {
65+
fileMatch := filePattern.FindStringSubmatch(href)
66+
67+
// If the URL is of a file.
68+
if len(fileMatch) > 0 {
69+
fileName := path.Base(href)
70+
fileNameMatch := exactFilePattern.FindStringSubmatch(fileName)
71+
72+
// If the URL matches the file filter regex.
73+
if len(fileNameMatch) > 0 {
74+
files = append(files, fileName)
75+
urls = append(urls, u)
76+
}
77+
}
78+
}
79+
80+
// Do not traverse the hierarchy in reverse order.
81+
if o.Recursive && !(strings.Contains(href, UpDir)) && href != RootDir {
82+
//nolint:errcheck
83+
co.Visit(e.Request.AbsoluteURL(href))
84+
}
85+
})
86+
87+
// Visit each root folder.
88+
for _, seedURL := range seeds {
89+
err := co.Visit(seedURL.String())
90+
if err != nil {
91+
return nil, errors.Wrap(err, fmt.Sprintf("error scraping file with URL %seedURLs", seedURL.String()))
92+
}
93+
}
94+
95+
return &Result{BaseNames: files, URLs: urls}, nil
96+
}

pkg/find/find.go

Lines changed: 1 addition & 184 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,10 @@
11
package find
22

33
import (
4-
"fmt"
54
"net/url"
6-
"path"
75
"regexp"
86
"strings"
97

10-
"github.com/gocolly/colly"
11-
d "github.com/gocolly/colly/debug"
128
"github.com/pkg/errors"
139
)
1410

@@ -122,7 +118,7 @@ func (o *Options) Validate() error {
122118
}
123119

124120
func (o *Options) sanitize() {
125-
if strings.HasPrefix(o.FilenameRegexp, "^") && !strings.HasPrefix(o.FilenameRegexp, "^./") {
121+
if strings.HasPrefix(o.FilenameRegexp, "^") && !strings.HasPrefix(o.FilenameRegexp, "^./") && !strings.HasPrefix(o.FilenameRegexp, `^(\./)?`) {
126122
o.FilenameRegexp = strings.Replace(o.FilenameRegexp, "^", `^(\./)?`, 1)
127123
}
128124

@@ -148,182 +144,3 @@ func (o *Options) Find() (*Result, error) {
148144
return o.crawlFiles()
149145
}
150146
}
151-
152-
// crawlFiles returns a list of file names found from the seed URL, filtered by file name regex.
153-
//
154-
//nolint:funlen,cyclop
155-
func (o *Options) crawlFiles() (*Result, error) {
156-
seeds := []*url.URL{}
157-
158-
err := o.Validate()
159-
if err != nil {
160-
return nil, err
161-
}
162-
163-
for _, v := range o.SeedURLs {
164-
u, _ := url.Parse(v)
165-
166-
seeds = append(seeds, u)
167-
}
168-
169-
var files, urls []string
170-
171-
folderPattern := regexp.MustCompile(folderRegex)
172-
173-
exactFilePattern := regexp.MustCompile(o.FilenameRegexp)
174-
175-
fileRegex := strings.TrimPrefix(o.FilenameRegexp, "^")
176-
filePattern := regexp.MustCompile(fileRegex)
177-
178-
allowedDomains := getHostnamesFromURLs(seeds)
179-
180-
// Create the collector settings
181-
coOptions := []func(*colly.Collector){
182-
colly.AllowedDomains(allowedDomains...),
183-
colly.Async(false),
184-
}
185-
186-
if o.Verbose {
187-
coOptions = append(coOptions, colly.Debugger(&d.LogDebugger{}))
188-
}
189-
190-
// Create the collector.
191-
co := colly.NewCollector(coOptions...)
192-
193-
// Add the callback to Visit the linked resource, for each HTML element found
194-
co.OnHTML(HTMLTagLink, func(e *colly.HTMLElement) {
195-
link := e.Attr(HTMLAttrRef)
196-
197-
// Do not traverse the hierarchy in reverse order.
198-
if o.Recursive && !(strings.Contains(link, UpDir)) && link != RootDir {
199-
//nolint:errcheck
200-
co.Visit(e.Request.AbsoluteURL(link))
201-
}
202-
})
203-
204-
// Add the analysis callback to find file URLs, for each Visit call
205-
co.OnRequest(func(r *colly.Request) {
206-
folderMatch := folderPattern.FindStringSubmatch(r.URL.String())
207-
208-
// If the URL is not of a folder.
209-
if len(folderMatch) == 0 {
210-
fileMatch := filePattern.FindStringSubmatch(r.URL.String())
211-
212-
// If the URL is of a file.
213-
if len(fileMatch) > 0 {
214-
fileName := path.Base(r.URL.String())
215-
fileNameMatch := exactFilePattern.FindStringSubmatch(fileName)
216-
217-
// If the URL matches the file filter regex.
218-
if len(fileNameMatch) > 0 {
219-
files = append(files, fileName)
220-
urls = append(urls, r.URL.String())
221-
}
222-
}
223-
// Otherwise abort the request.
224-
r.Abort()
225-
}
226-
})
227-
228-
// Visit each root folder.
229-
for _, seedURL := range seeds {
230-
err := co.Visit(seedURL.String())
231-
if err != nil {
232-
return nil, errors.Wrap(err, fmt.Sprintf("error scraping file with URL %seedURLs", seedURL.String()))
233-
}
234-
}
235-
236-
return &Result{BaseNames: files, URLs: urls}, nil
237-
}
238-
239-
// crawlFolders returns a list of folder names found from each seed URL, filtered by folder name regex.
240-
//
241-
//nolint:funlen,cyclop
242-
func (o *Options) crawlFolders() (*Result, error) {
243-
seeds := []*url.URL{}
244-
245-
err := o.Validate()
246-
if err != nil {
247-
return nil, err
248-
}
249-
250-
for _, v := range o.SeedURLs {
251-
u, _ := url.Parse(v)
252-
253-
seeds = append(seeds, u)
254-
}
255-
256-
var folders, urls []string
257-
258-
folderPattern := regexp.MustCompile(folderRegex)
259-
260-
exactFolderPattern := regexp.MustCompile(o.FilenameRegexp)
261-
262-
allowedDomains := getHostnamesFromURLs(seeds)
263-
if len(allowedDomains) < 1 {
264-
//nolint:goerr113
265-
return nil, fmt.Errorf("invalid seed urls")
266-
}
267-
268-
// Create the collector settings
269-
coOptions := []func(*colly.Collector){
270-
colly.AllowedDomains(allowedDomains...),
271-
colly.Async(false),
272-
}
273-
274-
if o.Verbose {
275-
coOptions = append(coOptions, colly.Debugger(&d.LogDebugger{}))
276-
}
277-
278-
// Create the collector.
279-
co := colly.NewCollector(coOptions...)
280-
281-
// Visit each specific folder.
282-
co.OnHTML(HTMLTagLink, func(e *colly.HTMLElement) {
283-
href := e.Attr(HTMLAttrRef)
284-
285-
folderMatch := folderPattern.FindStringSubmatch(href)
286-
287-
// if the URL is of a folder.
288-
//nolint:nestif
289-
if len(folderMatch) > 0 {
290-
// Do not traverse the hierarchy in reverse order.
291-
if strings.Contains(href, UpDir) || href == RootDir {
292-
return
293-
}
294-
295-
exactFolderMatch := exactFolderPattern.FindStringSubmatch(href)
296-
if len(exactFolderMatch) > 0 {
297-
hrefAbsURL, _ := url.Parse(e.Request.AbsoluteURL(href))
298-
299-
if !urlSliceContains(seeds, hrefAbsURL) {
300-
folders = append(folders, path.Base(hrefAbsURL.Path))
301-
urls = append(urls, hrefAbsURL.String())
302-
}
303-
}
304-
if o.Recursive {
305-
//nolint:errcheck
306-
co.Visit(e.Request.AbsoluteURL(href))
307-
}
308-
}
309-
})
310-
311-
co.OnRequest(func(r *colly.Request) {
312-
folderMatch := folderPattern.FindStringSubmatch(r.URL.String())
313-
314-
// if the URL is not of a folder.
315-
if len(folderMatch) == 0 {
316-
r.Abort()
317-
}
318-
})
319-
320-
// Visit each root folder.
321-
for _, seedURL := range seeds {
322-
err := co.Visit(seedURL.String())
323-
if err != nil {
324-
return nil, errors.Wrap(err, fmt.Sprintf("error scraping folder with URL %seedURLs", seedURL.String()))
325-
}
326-
}
327-
328-
return &Result{BaseNames: folders, URLs: urls}, nil
329-
}

0 commit comments

Comments
 (0)