Skip to content

Commit 73c7dde

Browse files
Copilot and NGTmeaty authored
Add Zeno get list command for archiving URLs from files (#514)
* Initial plan * Add Zeno get list command to archive URLs from files - Created get_list.go with support for local and remote files - Added helper functions to read URLs from files (supporting comments and empty lines) - Registered list command in get.go - Added e2e test for the list command - Updated e2e package with helper functions for list command testing Co-authored-by: NGTmeaty <2244519+NGTmeaty@users.noreply.github.com> * Remove redundant validation check in get_list The MinimumNArgs(1) already ensures at least one argument is provided Co-authored-by: NGTmeaty <2244519+NGTmeaty@users.noreply.github.com> * Address PR feedback: improve list command and test - Add missing urls.txt test file with multiple URLs - Update command Use field to specify FILE|URL - Set default user agent for remote file downloads - Enhance test to verify all URLs from list are archived Co-authored-by: NGTmeaty <2244519+NGTmeaty@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: NGTmeaty <2244519+NGTmeaty@users.noreply.github.com>
1 parent 2abbcd3 commit 73c7dde

File tree

6 files changed

+259
-3
lines changed

6 files changed

+259
-3
lines changed

cmd/get.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ func getCMDs() *cobra.Command {
2222
getHQCmdFlags(getHQCmd)
2323

2424
getCmd.AddCommand(getURLCmd)
25+
getCmd.AddCommand(getListCmd)
2526
getCmd.AddCommand(getHQCmd)
2627

2728
return getCmd

cmd/get_list.go

Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
package cmd
2+
3+
import (
4+
"bufio"
5+
"fmt"
6+
"net/http"
7+
"os"
8+
"strings"
9+
"time"
10+
11+
"github.com/internetarchive/Zeno/internal/pkg/config"
12+
"github.com/internetarchive/Zeno/internal/pkg/controler"
13+
"github.com/internetarchive/Zeno/internal/pkg/ui"
14+
"github.com/spf13/cobra"
15+
)
16+
17+
var getListCmd = &cobra.Command{
18+
Use: "list [FILE|URL...]",
19+
Short: "Archive URLs from text file(s)",
20+
Long: `Archive URLs from one or more text files or URLs.
21+
Each file should contain one URL per line.
22+
Remote files (starting with http:// or https://) are supported.
23+
Empty lines and lines starting with # are ignored.`,
24+
Args: cobra.MinimumNArgs(1),
25+
PreRunE: func(_ *cobra.Command, args []string) error {
26+
if cfg == nil {
27+
return fmt.Errorf("viper config is nil")
28+
}
29+
30+
return nil
31+
},
32+
RunE: func(_ *cobra.Command, args []string) error {
33+
// Read URLs from all provided files
34+
for _, file := range args {
35+
var urls []string
36+
var err error
37+
38+
if strings.HasPrefix(file, "http://") || strings.HasPrefix(file, "https://") {
39+
urls, err = readRemoteURLList(file)
40+
} else {
41+
urls, err = readLocalURLList(file)
42+
}
43+
44+
if err != nil {
45+
return fmt.Errorf("error reading file %s: %w", file, err)
46+
}
47+
48+
// Add URLs to config
49+
config.Get().InputSeeds = append(config.Get().InputSeeds, urls...)
50+
}
51+
52+
if len(config.Get().InputSeeds) == 0 {
53+
return fmt.Errorf("no URLs found in provided files")
54+
}
55+
56+
err := config.GenerateCrawlConfig()
57+
if err != nil {
58+
return err
59+
}
60+
61+
if cfg.PyroscopeAddress != "" {
62+
err = startPyroscope()
63+
if err != nil {
64+
return err
65+
}
66+
}
67+
68+
if cfg.SentryDSN != "" {
69+
err = startSentry()
70+
if err != nil {
71+
return err
72+
}
73+
}
74+
75+
controler.Start()
76+
if config.Get().TUI {
77+
tui := ui.New()
78+
err := tui.Start()
79+
if err != nil {
80+
return fmt.Errorf("error starting TUI: %w", err)
81+
}
82+
} else {
83+
controler.WatchSignals()
84+
}
85+
return nil
86+
},
87+
}
88+
89+
// readLocalURLList reads a URL list from the local file named by file.
//
// The file is read line by line. Surrounding whitespace is trimmed from each
// line; empty lines and lines starting with '#' are skipped. A UTF-8 byte
// order mark at the very start of the file is removed so that it does not
// become part of the first URL.
//
// It returns the remaining lines in file order. On any error (the file cannot
// be opened, or reading fails) it returns a nil slice and the error.
func readLocalURLList(file string) (urls []string, err error) {
	// Upper bound on the length of a single line. bufio.Scanner's default
	// limit is 64 KiB (bufio.MaxScanTokenSize) and a longer line would make
	// the whole read fail with bufio.ErrTooLong.
	const maxLineSize = 1024 * 1024

	f, err := os.Open(file)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	scanner := bufio.NewScanner(f)
	scanner.Buffer(make([]byte, 0, bufio.MaxScanTokenSize), maxLineSize)

	firstLine := true
	for scanner.Scan() {
		line := scanner.Text()
		if firstLine {
			// strings.TrimSpace does not treat U+FEFF as whitespace, so the
			// byte order mark has to be stripped explicitly.
			line = strings.TrimPrefix(line, "\ufeff")
			firstLine = false
		}

		line = strings.TrimSpace(line)
		if line == "" || strings.HasPrefix(line, "#") {
			continue
		}
		urls = append(urls, line)
	}

	// Do not hand back a partially read list together with an error.
	if err = scanner.Err(); err != nil {
		return nil, err
	}
	return urls, nil
}
108+
109+
// readRemoteURLList reads URLs from a remote file (http/https)
110+
func readRemoteURLList(URL string) (urls []string, err error) {
111+
httpClient := &http.Client{
112+
Timeout: time.Second * 30,
113+
}
114+
115+
req, err := http.NewRequest(http.MethodGet, URL, nil)
116+
if err != nil {
117+
return urls, err
118+
}
119+
120+
// Set user agent, use default if not configured
121+
userAgent := config.Get().UserAgent
122+
if userAgent == "" {
123+
userAgent = "Mozilla/5.0 (compatible; Zeno)"
124+
}
125+
req.Header.Set("User-Agent", userAgent)
126+
127+
resp, err := httpClient.Do(req)
128+
if err != nil {
129+
return urls, err
130+
}
131+
132+
defer resp.Body.Close()
133+
134+
if resp.StatusCode != http.StatusOK {
135+
return urls, fmt.Errorf("failed to download URL list: %s", resp.Status)
136+
}
137+
138+
// Read file line by line
139+
scanner := bufio.NewScanner(resp.Body)
140+
for scanner.Scan() {
141+
line := strings.TrimSpace(scanner.Text())
142+
if line == "" || strings.HasPrefix(line, "#") {
143+
continue
144+
}
145+
urls = append(urls, line)
146+
}
147+
return urls, scanner.Err()
148+
}

e2e/e2e.go

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,16 +23,16 @@ var DialTimeout = 10 * time.Second
2323

2424
func cmdZenoGetURL(urls []string) *cobra.Command {
2525
cmd := cmd.Prepare()
26-
26+
2727
// If config.toml exists in pwd, include it in args
2828
var args []string
2929
commonArgs := []string{"get", "url", "--log-e2e", "--log-e2e-level", "debug", "--no-stdout-log", "--no-stderr-log"}
3030
if _, err := os.Stat("config.toml"); err == nil {
31-
args = append(commonArgs,append([]string{"--config-file", "config.toml"}, urls...)...)
31+
args = append(commonArgs, append([]string{"--config-file", "config.toml"}, urls...)...)
3232
} else {
3333
args = append(commonArgs, urls...)
3434
}
35-
35+
3636
fmt.Println("Command arguments:", args)
3737
cmd.SetArgs(args)
3838
return cmd
@@ -71,6 +71,23 @@ func connectThenCopy(t *testing.T, wg *sync.WaitGroup, W *io.PipeWriter) {
7171
defer W.Close()
7272
}
7373

74+
func cmdZenoGetList(files []string) *cobra.Command {
75+
cmd := cmd.Prepare()
76+
77+
// If config.toml exists in pwd, include it in args
78+
var args []string
79+
commonArgs := []string{"get", "list", "--log-e2e", "--log-e2e-level", "debug", "--no-stdout-log", "--no-stderr-log"}
80+
if _, err := os.Stat("config.toml"); err == nil {
81+
args = append(commonArgs, append([]string{"--config-file", "config.toml"}, files...)...)
82+
} else {
83+
args = append(commonArgs, files...)
84+
}
85+
86+
fmt.Println("Command arguments:", args)
87+
cmd.SetArgs(args)
88+
return cmd
89+
}
90+
7491
// ExecuteCmdZenoGetURL executes the Zeno get URL command with the e2e logging, URLs and custom config.toml file
7592
func ExecuteCmdZenoGetURL(t *testing.T, wg *sync.WaitGroup, urls []string) {
7693
defer wg.Done()
@@ -80,6 +97,15 @@ func ExecuteCmdZenoGetURL(t *testing.T, wg *sync.WaitGroup, urls []string) {
8097
}
8198
}
8299

100+
// ExecuteCmdZenoGetList executes the Zeno get list command with the e2e logging, file paths and custom config.toml file
101+
func ExecuteCmdZenoGetList(t *testing.T, wg *sync.WaitGroup, files []string) {
102+
defer wg.Done()
103+
cmdErr := cmdZenoGetList(files).Execute()
104+
if cmdErr != nil {
105+
t.Errorf("failed to start command: %v", cmdErr)
106+
}
107+
}
108+
83109
// Connects to the log conn and processes log records using the provided RecordMatcher
84110
func StartHandleLogRecord(t *testing.T, wg *sync.WaitGroup, rm log.RecordMatcher, stopCh chan struct{}) {
85111
defer wg.Done()

e2e/test/getlist/config.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
max-retry = 0 # avoid waiting for retries to speed up test

e2e/test/getlist/getlist_test.go

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
package getlist
2+
3+
import (
4+
_ "embed"
5+
"os"
6+
"strings"
7+
"sync"
8+
"testing"
9+
10+
"github.com/internetarchive/Zeno/e2e"
11+
)
12+
13+
// recordMatcher inspects log records and tracks which URLs were reported as
// archived and whether any ERROR record was seen.
type recordMatcher struct {
	// expectedURLs are the URLs that must be archived for the test to pass.
	expectedURLs []string
	// archivedURLs holds every URL seen in a "url archived" INFO record.
	archivedURLs map[string]bool
	// unexpectedError is set once any ERROR level record has been seen.
	unexpectedError bool
}

// newRecordMatcher returns a matcher that expects the three URLs of the test
// list to be archived.
func newRecordMatcher() *recordMatcher {
	return &recordMatcher{
		expectedURLs: []string{
			"https://example.com/",
			"https://example.com/page1",
			"https://example.com/page2",
		},
		archivedURLs: make(map[string]bool),
	}
}

// Match records the outcome carried by one log record: the "url" field of an
// INFO record whose message contains "url archived" is remembered, and any
// ERROR record marks the run as failed.
func (rm *recordMatcher) Match(record map[string]string) {
	switch record["level"] {
	case "INFO":
		if !strings.Contains(record["msg"], "url archived") {
			return
		}
		// Extract URL from the log record
		if url, ok := record["url"]; ok {
			rm.archivedURLs[url] = true
		}
	case "ERROR":
		rm.unexpectedError = true
	}
}

// Assert reports an error on t for every expected URL that was not archived
// and for any ERROR record seen.
func (rm *recordMatcher) Assert(t *testing.T) {
	for _, expectedURL := range rm.expectedURLs {
		if !rm.archivedURLs[expectedURL] {
			t.Errorf("Zeno did not archive expected URL: %s", expectedURL)
		}
	}

	if rm.unexpectedError {
		t.Error("An unexpected error was logged during the test")
	}
}

// ShouldStop reports whether record handling can end: either an ERROR record
// was seen, or every expected URL has been archived.
//
// Archived URLs that are not in the expected list do not count, so unrelated
// records cannot end the run before the expected URLs have been seen.
func (rm *recordMatcher) ShouldStop() bool {
	if rm.unexpectedError {
		return true
	}
	for _, expectedURL := range rm.expectedURLs {
		if !rm.archivedURLs[expectedURL] {
			return false
		}
	}
	return true
}
60+
61+
func TestGetList(t *testing.T) {
62+
os.RemoveAll("jobs")
63+
defer os.RemoveAll("jobs")
64+
65+
shouldStopCh := make(chan struct{})
66+
rm := newRecordMatcher()
67+
wg := &sync.WaitGroup{}
68+
69+
wg.Add(2)
70+
71+
go e2e.StartHandleLogRecord(t, wg, rm, shouldStopCh)
72+
go e2e.ExecuteCmdZenoGetList(t, wg, []string{"urls.txt"})
73+
74+
e2e.WaitForGoroutines(t, wg, shouldStopCh)
75+
rm.Assert(t)
76+
}

e2e/test/getlist/urls.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
# Test URL list for e2e test
2+
https://example.com
3+
https://example.com/page1
4+
https://example.com/page2

0 commit comments

Comments
 (0)