11package find
22
33import (
4- "fmt"
54 "net/url"
6- "path"
75 "regexp"
86 "strings"
97
10- "github.com/gocolly/colly"
11- d "github.com/gocolly/colly/debug"
128 "github.com/pkg/errors"
139)
1410
@@ -122,7 +118,7 @@ func (o *Options) Validate() error {
122118}
123119
124120func (o * Options ) sanitize () {
125- if strings .HasPrefix (o .FilenameRegexp , "^" ) && ! strings .HasPrefix (o .FilenameRegexp , "^./" ) {
121+ if strings .HasPrefix (o .FilenameRegexp , "^" ) && ! strings .HasPrefix (o .FilenameRegexp , "^./" ) && ! strings . HasPrefix ( o . FilenameRegexp , `^(\./)?` ) {
126122 o .FilenameRegexp = strings .Replace (o .FilenameRegexp , "^" , `^(\./)?` , 1 )
127123 }
128124
@@ -148,182 +144,3 @@ func (o *Options) Find() (*Result, error) {
148144 return o .crawlFiles ()
149145 }
150146}
151-
152- // crawlFiles returns a list of file names found from the seed URL, filtered by file name regex.
153- //
154- //nolint:funlen,cyclop
155- func (o * Options ) crawlFiles () (* Result , error ) {
156- seeds := []* url.URL {}
157-
158- err := o .Validate ()
159- if err != nil {
160- return nil , err
161- }
162-
163- for _ , v := range o .SeedURLs {
164- u , _ := url .Parse (v )
165-
166- seeds = append (seeds , u )
167- }
168-
169- var files , urls []string
170-
171- folderPattern := regexp .MustCompile (folderRegex )
172-
173- exactFilePattern := regexp .MustCompile (o .FilenameRegexp )
174-
175- fileRegex := strings .TrimPrefix (o .FilenameRegexp , "^" )
176- filePattern := regexp .MustCompile (fileRegex )
177-
178- allowedDomains := getHostnamesFromURLs (seeds )
179-
180- // Create the collector settings
181- coOptions := []func (* colly.Collector ){
182- colly .AllowedDomains (allowedDomains ... ),
183- colly .Async (false ),
184- }
185-
186- if o .Verbose {
187- coOptions = append (coOptions , colly .Debugger (& d.LogDebugger {}))
188- }
189-
190- // Create the collector.
191- co := colly .NewCollector (coOptions ... )
192-
193- // Add the callback to Visit the linked resource, for each HTML element found
194- co .OnHTML (HTMLTagLink , func (e * colly.HTMLElement ) {
195- link := e .Attr (HTMLAttrRef )
196-
197- // Do not traverse the hierarchy in reverse order.
198- if o .Recursive && ! (strings .Contains (link , UpDir )) && link != RootDir {
199- //nolint:errcheck
200- co .Visit (e .Request .AbsoluteURL (link ))
201- }
202- })
203-
204- // Add the analysis callback to find file URLs, for each Visit call
205- co .OnRequest (func (r * colly.Request ) {
206- folderMatch := folderPattern .FindStringSubmatch (r .URL .String ())
207-
208- // If the URL is not of a folder.
209- if len (folderMatch ) == 0 {
210- fileMatch := filePattern .FindStringSubmatch (r .URL .String ())
211-
212- // If the URL is of a file.
213- if len (fileMatch ) > 0 {
214- fileName := path .Base (r .URL .String ())
215- fileNameMatch := exactFilePattern .FindStringSubmatch (fileName )
216-
217- // If the URL matches the file filter regex.
218- if len (fileNameMatch ) > 0 {
219- files = append (files , fileName )
220- urls = append (urls , r .URL .String ())
221- }
222- }
223- // Otherwise abort the request.
224- r .Abort ()
225- }
226- })
227-
228- // Visit each root folder.
229- for _ , seedURL := range seeds {
230- err := co .Visit (seedURL .String ())
231- if err != nil {
232- return nil , errors .Wrap (err , fmt .Sprintf ("error scraping file with URL %seedURLs" , seedURL .String ()))
233- }
234- }
235-
236- return & Result {BaseNames : files , URLs : urls }, nil
237- }
238-
239- // crawlFolders returns a list of folder names found from each seed URL, filtered by folder name regex.
240- //
241- //nolint:funlen,cyclop
242- func (o * Options ) crawlFolders () (* Result , error ) {
243- seeds := []* url.URL {}
244-
245- err := o .Validate ()
246- if err != nil {
247- return nil , err
248- }
249-
250- for _ , v := range o .SeedURLs {
251- u , _ := url .Parse (v )
252-
253- seeds = append (seeds , u )
254- }
255-
256- var folders , urls []string
257-
258- folderPattern := regexp .MustCompile (folderRegex )
259-
260- exactFolderPattern := regexp .MustCompile (o .FilenameRegexp )
261-
262- allowedDomains := getHostnamesFromURLs (seeds )
263- if len (allowedDomains ) < 1 {
264- //nolint:goerr113
265- return nil , fmt .Errorf ("invalid seed urls" )
266- }
267-
268- // Create the collector settings
269- coOptions := []func (* colly.Collector ){
270- colly .AllowedDomains (allowedDomains ... ),
271- colly .Async (false ),
272- }
273-
274- if o .Verbose {
275- coOptions = append (coOptions , colly .Debugger (& d.LogDebugger {}))
276- }
277-
278- // Create the collector.
279- co := colly .NewCollector (coOptions ... )
280-
281- // Visit each specific folder.
282- co .OnHTML (HTMLTagLink , func (e * colly.HTMLElement ) {
283- href := e .Attr (HTMLAttrRef )
284-
285- folderMatch := folderPattern .FindStringSubmatch (href )
286-
287- // if the URL is of a folder.
288- //nolint:nestif
289- if len (folderMatch ) > 0 {
290- // Do not traverse the hierarchy in reverse order.
291- if strings .Contains (href , UpDir ) || href == RootDir {
292- return
293- }
294-
295- exactFolderMatch := exactFolderPattern .FindStringSubmatch (href )
296- if len (exactFolderMatch ) > 0 {
297- hrefAbsURL , _ := url .Parse (e .Request .AbsoluteURL (href ))
298-
299- if ! urlSliceContains (seeds , hrefAbsURL ) {
300- folders = append (folders , path .Base (hrefAbsURL .Path ))
301- urls = append (urls , hrefAbsURL .String ())
302- }
303- }
304- if o .Recursive {
305- //nolint:errcheck
306- co .Visit (e .Request .AbsoluteURL (href ))
307- }
308- }
309- })
310-
311- co .OnRequest (func (r * colly.Request ) {
312- folderMatch := folderPattern .FindStringSubmatch (r .URL .String ())
313-
314- // if the URL is not of a folder.
315- if len (folderMatch ) == 0 {
316- r .Abort ()
317- }
318- })
319-
320- // Visit each root folder.
321- for _ , seedURL := range seeds {
322- err := co .Visit (seedURL .String ())
323- if err != nil {
324- return nil , errors .Wrap (err , fmt .Sprintf ("error scraping folder with URL %seedURLs" , seedURL .String ()))
325- }
326- }
327-
328- return & Result {BaseNames : folders , URLs : urls }, nil
329- }
0 commit comments