From f3599926367a24633364d5ea1d8a9154b21985b9 Mon Sep 17 00:00:00 2001 From: Jake L Date: Tue, 31 Mar 2026 01:32:47 +0000 Subject: [PATCH 1/4] feat: add concat command to concatenate multiple WARC files --- cmd/warc/concat/concat.go | 187 ++++++++++++++++++++++++++++++++++++++ cmd/warc/main.go | 2 + 2 files changed, 189 insertions(+) create mode 100644 cmd/warc/concat/concat.go diff --git a/cmd/warc/concat/concat.go b/cmd/warc/concat/concat.go new file mode 100644 index 0000000..45ab05c --- /dev/null +++ b/cmd/warc/concat/concat.go @@ -0,0 +1,187 @@ +package concat + +import ( + "encoding/binary" + "fmt" + "io" + "log/slog" + "os" + "path/filepath" + "time" + + "github.com/spf13/cobra" +) + +// Command represents the concat command +var Command = &cobra.Command{ + Use: "concat [flags] file1.warc.gz file2.warc.gz ...", + Short: "Concatenate multiple WARC files into one and delete the originals", + Long: `Concatenate multiple WARC files into a single output WARC file. + +WARC files (including gzip-compressed ones) are simply concatenated at the +byte level. + +After a successful concatenation, the original input files are deleted unless +--no-delete is specified.`, + Args: cobra.MinimumNArgs(2), + Run: concat, +} + +func init() { + Command.Flags().StringP("output", "o", "", "Output WARC file path (required)") + Command.Flags().Bool("no-delete", false, "Keep original files after concatenation") + _ = Command.MarkFlagRequired("output") +} + +func concat(cmd *cobra.Command, files []string) { + output, err := cmd.Flags().GetString("output") + if err != nil { + slog.Error("failed to get output flag", "error", err) + return + } + + noDelete, err := cmd.Flags().GetBool("no-delete") + if err != nil { + slog.Error("failed to get no-delete flag", "error", err) + return + } + + startTime := time.Now() + + // Verify all input files exist, check for dictionary-compressed zstd, and collect sizes + var totalInputBytes int64 + for _, f := range files { + info, err := os.Stat(f) + if err != nil { + slog.Error("input file not accessible", "file", f, "error", err) + return + } + totalInputBytes += info.Size() + + if hasDictionaryFrame(f) { + slog.Error("file uses a zstd dictionary frame and cannot be safely concatenated at the byte level", + "file", f, + ) + return + } + } + + // Resolve absolute output path for clear logging + absOutput, err := filepath.Abs(output) + if err != nil { + absOutput = output + } + + slog.Info("concatenating WARC files", + "inputs", len(files), + "output", absOutput, + "totalInputBytes", totalInputBytes, + ) + + // Ensure the output directory exists + outputDir := filepath.Dir(absOutput) + if err := os.MkdirAll(outputDir, 0o755); err != nil { + slog.Error("failed to create output directory", "dir", outputDir, "error", err) + return + } + + // Detect if any input file is also the output path to prevent self-overwrite + for _, f := range files { + absInput, err := filepath.Abs(f) + if err != nil { + absInput = f + } + if absInput == absOutput { + slog.Error("output file is the same as one of the input files", "file", f) + return + } + } + + // Create (or truncate) the output file + out, err := os.Create(absOutput) + if err != nil { + slog.Error("failed to create output file", "file", absOutput, "error", err) + return + } + + // Track whether we completed successfully so we can clean up the output file on failure + success := false + defer func() { + if !success { + out.Close() + if removeErr := os.Remove(absOutput); removeErr != nil && !os.IsNotExist(removeErr) { + slog.Warn("failed to remove partial output file", "file", absOutput, "error", removeErr) + } + } + }() + + var totalWritten int64 + for _, f := range files { + written, err := copyFile(out, f) + if err != nil { + slog.Error("failed to copy file to output", "file", f, "output", absOutput, "error", err) + return + } + totalWritten += written + slog.Debug("appended file", "file", f, "bytes", written) + } + + if err := out.Close(); err != nil { + slog.Error("failed to close output file", "file", absOutput, "error", err) + return + } + + success = true + + slog.Info(fmt.Sprintf("concatenated in %s", time.Since(startTime).String()), + "output", absOutput, + "files", len(files), + "bytesWritten", totalWritten, + ) + + // Delete original files after successful concatenation + if !noDelete { + for _, f := range files { + if err := os.Remove(f); err != nil { + slog.Error("failed to delete original file", "file", f, "error", err) + } else { + slog.Info("deleted original file", "file", f) + } + } + } +} + +// hasDictionaryFrame reports whether a file begins with the zstd skippable dictionary +// frame written by gowarc (magic 0x184D2A5D, little-endian). Concatenating such files +// at the byte level is unsafe because each file embeds its own dictionary context. +func hasDictionaryFrame(path string) bool { + f, err := os.Open(path) + if err != nil { + return false + } + defer f.Close() + + var magic uint32 + if err := binary.Read(f, binary.LittleEndian, &magic); err != nil { + return false + } + + // 0x184D2A5D is the skippable-frame magic reserved for zstd dictionaries by the + // WARC-zstd spec: https://iipc.github.io/warc-specifications/specifications/warc-zstd/ + return magic == 0x184D2A5D +} + +// copyFile copies the contents of src into dst, returning the number of bytes written. +func copyFile(dst *os.File, src string) (int64, error) { + in, err := os.Open(src) + if err != nil { + return 0, fmt.Errorf("failed to open source file: %w", err) + } + defer in.Close() + + n, err := io.Copy(dst, in) + if err != nil { + return n, fmt.Errorf("failed to copy data: %w", err) + } + return n, nil +} diff --git a/cmd/warc/main.go b/cmd/warc/main.go index 0f21f22..6e162c0 100644 --- a/cmd/warc/main.go +++ b/cmd/warc/main.go @@ -4,6 +4,7 @@ import ( "log/slog" "os" + "github.com/internetarchive/gowarc/cmd/warc/concat" "github.com/internetarchive/gowarc/cmd/warc/extract" "github.com/internetarchive/gowarc/cmd/warc/mend" "github.com/internetarchive/gowarc/cmd/warc/verify" @@ -20,6 +21,7 @@ func init() { setupLogger(cmd) } + rootCmd.AddCommand(concat.Command) rootCmd.AddCommand(extract.Command) rootCmd.AddCommand(mend.Command) rootCmd.AddCommand(verify.Command) From f7f44d7588e9249700584894881fc6d8883c08b5 Mon Sep 17 00:00:00 2001 From: Jake L Date: Tue, 31 Mar 2026 02:07:07 +0000 Subject: [PATCH 2/4] feat: resolve comments --- cmd/warc/concat/concat.go | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/cmd/warc/concat/concat.go b/cmd/warc/concat/concat.go index 45ab05c..686c3fb 100644 --- a/cmd/warc/concat/concat.go +++ b/cmd/warc/concat/concat.go @@ -9,6 +9,7 @@ import ( "path/filepath" "time" + "github.com/internetarchive/gowarc/cmd/warc/utils" "github.com/spf13/cobra" ) @@ -56,6 +57,10 @@ func concat(cmd *cobra.Command, files []string) { slog.Error("input file not accessible", "file", f, "error", err) return } + if !info.Mode().IsRegular() { + slog.Error("input path is not a regular file", "file", f, "mode", info.Mode()) + return + } totalInputBytes += info.Size() if hasDictionaryFrame(f) { @@ -80,24 +85,18 @@ func concat(cmd *cobra.Command, files []string) { // Ensure the output directory exists outputDir := filepath.Dir(absOutput) - if err := os.MkdirAll(outputDir, 0o755); err != nil { + if err := os.MkdirAll(outputDir, utils.DefaultDirPermissions); err != nil { slog.Error("failed to create output directory", "dir", outputDir, "error", err) return } - // Detect if any input file is also the output path to prevent self-overwrite - for _, f := range files { - absInput, err := filepath.Abs(f) - if err != nil { - absInput = f - } - if absInput == absOutput { - slog.Error("output file is the same as one of the input files", "file", f) - return - } + // Detect if any input file is also the output path to prevent self-overwrite. + if _, err := os.Stat(absOutput); err == nil { + slog.Error("output file already exists", "file", absOutput) + return } - // Create (or truncate) the output file + // Create (or replace) the output file out, err := os.Create(absOutput) if err != nil { slog.Error("failed to create output file", "file", absOutput, "error", err) From a1e9f7ca399b9a79ae64632eb003a9c772d0b6fa Mon Sep 17 00:00:00 2001 From: Will Howes Date: Tue, 31 Mar 2026 17:52:16 -0500 Subject: [PATCH 3/4] feat: add and implement no-dedup flag --- cmd/warc/concat/concat.go | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/cmd/warc/concat/concat.go b/cmd/warc/concat/concat.go index 686c3fb..1c7b64a 100644 --- a/cmd/warc/concat/concat.go +++ b/cmd/warc/concat/concat.go @@ -7,6 +7,7 @@ import ( "log/slog" "os" "path/filepath" + "slices" "time" "github.com/internetarchive/gowarc/cmd/warc/utils" @@ -20,7 +21,7 @@ var Command = &cobra.Command{ Long: `Concatenate multiple WARC files into a single output WARC file. WARC files (including gzip-compressed ones) are simply concatenated at the -byte level. +byte level. The file list is sorted and deduplicated unless --no-dedup is specified. After a successful concatenation, the original input files are deleted unless --no-delete is specified.`, @@ -31,6 +32,7 @@ After a successful concatenation, the original input files are deleted unless func init() { Command.Flags().StringP("output", "o", "", "Output WARC file path (required)") Command.Flags().Bool("no-delete", false, "Keep original files after concatenation") + Command.Flags().Bool("no-dedup", false, "Don't sort and dedup file list") _ = Command.MarkFlagRequired("output") } @@ -47,8 +49,18 @@ func concat(cmd *cobra.Command, files []string) { return } + noDedup, err := cmd.Flags().GetBool("no-dedup") + if err != nil { + slog.Error("failed to get no-dedup flag", "error", err) + return + } startTime := time.Now() + if !noDedup { + slices.Sort(files) + files = slices.Compact(files) + } + // Verify all input files exist, check for dictionary-compressed zstd, and collect sizes var totalInputBytes int64 for _, f := range files { From 4342baf2613e4b92f909b6df448effe0c340ab50 Mon Sep 17 00:00:00 2001 From: Will Howes Date: Tue, 31 Mar 2026 18:03:45 -0500 Subject: [PATCH 4/4] docs: improve comments for output file logic --- cmd/warc/concat/concat.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmd/warc/concat/concat.go b/cmd/warc/concat/concat.go index 1c7b64a..f4e2f0f 100644 --- a/cmd/warc/concat/concat.go +++ b/cmd/warc/concat/concat.go @@ -102,13 +102,13 @@ func concat(cmd *cobra.Command, files []string) { return } - // Detect if any input file is also the output path to prevent self-overwrite. + // Verify the output filepath won't collide with any existing files if _, err := os.Stat(absOutput); err == nil { slog.Error("output file already exists", "file", absOutput) return } - // Create (or replace) the output file + // Create the output file out, err := os.Create(absOutput) if err != nil { slog.Error("failed to create output file", "file", absOutput, "error", err)