cmd: add corpus clean command to remove invalid sequences (crytic#777)

dguido · claude · anishnaik · web-flow · commit bc3d1d00d1cf · 2026-02-06T06:02:46.000-05:00
* cmd: add corpus clean command to remove invalid sequences After contract refactoring, the corpus may contain many invalid sequences that cannot be executed. This adds a new `medusa corpus clean` command to remove these invalid sequences from disk. The command: - Compiles contracts and sets up a test chain - Loads and validates each call sequence in the corpus - Removes sequences that fail to execute (contract resolution failures, ABI mismatches, or execution errors) - Supports --dry-run to preview what would be deleted - Reports statistics on valid/invalid sequences Fixes crytic#743 Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> * corpus: refactor corpus cleaner to match corpus_pruner pattern Move CorpusCleaner to fuzzing/corpus/ package to improve separation of concerns and match the organizational pattern of corpus_pruner.go. Changes: - Create fuzzing/corpus/corpus_cleaner.go with refactored CorpusCleaner that receives dependencies as parameters (no *Fuzzer dependency) - Update fuzzing/corpus_cleaner.go to be a thin wrapper that sets up dependencies and delegates to corpus.CorpusCleaner - Follows same pattern as corpus_pruner.go for consistency Benefits: - Better separation of concerns (corpus package independent of Fuzzer) - More testable (CorpusCleaner can be tested with mocked dependencies) - Consistent architecture within the corpus package Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com> * cmd: refactor corpus clean subcommand to remove dry-run and standardize architecture - Remove --dry-run flag completely (invalid sequences are always deleted) - Add CreateTestChainForCleaning() public helper to Fuzzer for CLI use - Delete fuzzing/corpus_cleaner.go wrapper (CLI now calls corpus package directly) - Standardize flag handling by creating cmd/corpus_flags.go - Move all cleaning logic to corpus package (no Fuzzer dependency) - Follow corpus_pruner pattern for clean separation of concerns Co-Authored-By: Claude Sonnet 4.5 
<noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com> Co-authored-by: anishnaik <anish.naik@trailofbits.com>
diff --git a/cmd/corpus.go b/cmd/corpus.go
@@ -0,0 +1,173 @@
+package cmd
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"os/signal"
+	"path/filepath"
+	"time"
+
+	"github.com/crytic/medusa/fuzzing"
+	"github.com/crytic/medusa/fuzzing/config"
+	"github.com/crytic/medusa/fuzzing/corpus"
+	"github.com/crytic/medusa/logging/colors"
+	"github.com/spf13/cobra"
+)
+
+// corpusCmd is the parent "corpus" command. It declares no run function of its own and acts as
+// the group under which corpus-management subcommands (see init) are attached.
+var corpusCmd = &cobra.Command{
+	Use:   "corpus",
+	Short: "Manage the fuzzing corpus",
+	Long:  `Commands for managing the fuzzing corpus, including cleaning invalid sequences.`,
+}
+
+// corpusCleanCmd is the "corpus clean" subcommand; its work is performed by cmdRunCorpusClean.
+// SilenceUsage and SilenceErrors are both enabled, so cobra itself prints neither the usage text
+// nor the returned error when the run function fails.
+var corpusCleanCmd = &cobra.Command{
+	Use:   "clean",
+	Short: "Remove invalid sequences from the corpus",
+	Long: `Validates each call sequence in the corpus by attempting to execute it on a test chain.
+Sequences that fail (due to contract changes, ABI mismatches, or execution errors) are removed from disk.
+
+This command is useful after refactoring contracts when the corpus contains many invalid sequences.`,
+	RunE:          cmdRunCorpusClean,
+	SilenceUsage:  true,
+	SilenceErrors: true,
+}
+
+// init assembles the corpus command tree: it registers the flags of the clean subcommand,
+// attaches clean to the corpus command, and attaches the corpus command to the root command.
+func init() {
+	// A flag registration failure is reported through cmdLogger.Panic
+	if err := addCorpusCleanFlags(); err != nil {
+		cmdLogger.Panic("Failed to initialize the corpus command", err)
+	}
+
+	// corpus -> clean
+	corpusCmd.AddCommand(corpusCleanCmd)
+
+	// root -> corpus
+	rootCmd.AddCommand(corpusCmd)
+}
+
+// cmdRunCorpusClean executes the corpus clean command. It resolves and reads the project
+// configuration, changes the working directory to the configuration file's directory, creates a
+// fuzzer and a test chain, loads the corpus, and runs the corpus cleaner over it.
+//
+// An interrupt signal cancels the run. In that case the cleaner returns the statistics gathered
+// so far without an error, and they are reported as partial rather than as a completed run.
+//
+// Every failure is logged through cmdLogger and also returned to the caller.
+func cmdRunCorpusClean(cmd *cobra.Command, args []string) error {
+	// Get config path from flag
+	configFlagUsed := cmd.Flags().Changed("config")
+	configPath, err := cmd.Flags().GetString("config")
+	if err != nil {
+		cmdLogger.Error("Failed to get config flag", err)
+		return err
+	}
+
+	// Without an explicit --config, fall back to the default config file in the working directory
+	if !configFlagUsed {
+		workingDirectory, err := os.Getwd()
+		if err != nil {
+			cmdLogger.Error("Failed to get working directory", err)
+			return err
+		}
+		configPath = filepath.Join(workingDirectory, DefaultProjectConfigFilename)
+	}
+
+	// Check if config file exists
+	if _, err := os.Stat(configPath); err != nil {
+		cmdLogger.Error("Config file not found", err)
+		return fmt.Errorf("config file not found at %s", configPath)
+	}
+
+	// Read config
+	cmdLogger.Info("Reading configuration file at: ", colors.Bold, configPath, colors.Reset)
+	projectConfig, err := config.ReadProjectConfigFromFile(configPath, DefaultCompilationPlatform)
+	if err != nil {
+		cmdLogger.Error("Failed to read config file", err)
+		return err
+	}
+
+	// Change to config directory so that relative paths in the config resolve against it
+	if err := os.Chdir(filepath.Dir(configPath)); err != nil {
+		cmdLogger.Error("Failed to change to config directory", err)
+		return err
+	}
+
+	// Check if corpus directory is configured
+	if projectConfig.Fuzzing.CorpusDirectory == "" {
+		cmdLogger.Error("No corpus directory configured", nil)
+		return fmt.Errorf("no corpus directory configured in %s", configPath)
+	}
+
+	// Check that the corpus path exists, is accessible, and is a directory. Stopping here on
+	// any stat failure avoids compiling contracts only to fail later on an unusable corpus path.
+	corpusDir := projectConfig.Fuzzing.CorpusDirectory
+	corpusDirInfo, err := os.Stat(corpusDir)
+	switch {
+	case os.IsNotExist(err):
+		cmdLogger.Error("Corpus directory does not exist", nil)
+		return fmt.Errorf("corpus directory does not exist: %s", corpusDir)
+	case err != nil:
+		cmdLogger.Error("Failed to access corpus directory", err)
+		return err
+	case !corpusDirInfo.IsDir():
+		cmdLogger.Error("Corpus path is not a directory", nil)
+		return fmt.Errorf("corpus path is not a directory: %s", corpusDir)
+	}
+
+	// Create fuzzer (this handles compilation and contract definitions)
+	cmdLogger.Info("Initializing fuzzer...")
+	fuzzer, err := fuzzing.NewFuzzer(*projectConfig)
+	if err != nil {
+		cmdLogger.Error("Failed to initialize fuzzer", err)
+		return err
+	}
+
+	// Create context with cancellation for interrupt handling
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	// Handle interrupt. The signal registration is undone on return, and the goroutine also
+	// watches ctx so that it exits when the command finishes without ever being interrupted.
+	sigCh := make(chan os.Signal, 1)
+	signal.Notify(sigCh, os.Interrupt)
+	defer signal.Stop(sigCh)
+	go func() {
+		select {
+		case <-sigCh:
+			cmdLogger.Info("Interrupted, stopping...")
+			cancel()
+		case <-ctx.Done():
+			// The command returned on its own; nothing left to do.
+		}
+	}()
+
+	// Create test chain and build deployed contracts map
+	cmdLogger.Info("Setting up test chain...")
+	testChain, deployedContracts, err := fuzzer.CreateTestChainForCleaning()
+	if err != nil {
+		cmdLogger.Error("Failed to setup test chain", err)
+		return err
+	}
+	defer testChain.Close()
+
+	// Create and initialize the corpus
+	cmdLogger.Info("Creating corpus...")
+	fuzzerCorpus, err := corpus.NewCorpus(corpusDir)
+	if err != nil {
+		cmdLogger.Error("Failed to create the corpus", err)
+		return err
+	}
+	err = fuzzerCorpus.Initialize(testChain, fuzzer.ContractDefinitions())
+	if err != nil {
+		cmdLogger.Error("Failed to initialize the corpus", err)
+		return err
+	}
+
+	cmdLogger.Info("Loading and validating corpus from: ", colors.Bold, corpusDir, colors.Reset)
+
+	// Create cleaner and run
+	cleaner := corpus.NewCorpusCleaner(fuzzerCorpus, cmdLogger)
+	start := time.Now()
+	result, err := cleaner.Clean(ctx, testChain, deployedContracts)
+	if err != nil {
+		cmdLogger.Error("Error during corpus cleaning", err)
+		return err
+	}
+
+	// A cancelled context means the cleaner stopped early and the counts below are partial
+	elapsed := time.Since(start).Round(time.Second)
+	if ctx.Err() != nil {
+		cmdLogger.Info("Corpus cleaning interrupted after ", elapsed, "; the results below are partial")
+	} else {
+		cmdLogger.Info("Corpus cleaning completed in ", elapsed)
+	}
+
+	// Report results
+	invalidCount := len(result.InvalidSequences)
+	cmdLogger.Info(
+		"Results: ",
+		colors.Bold, result.ValidSequences, colors.Reset, " valid, ",
+		colors.Bold, invalidCount, colors.Reset, " invalid out of ",
+		colors.Bold, result.TotalSequences, colors.Reset, " total sequences",
+	)
+
+	if invalidCount > 0 {
+		cmdLogger.Info(colors.Bold, invalidCount, colors.Reset, " invalid sequences removed from disk")
+	}
+
+	return nil
+}
diff --git a/cmd/corpus_flags.go b/cmd/corpus_flags.go
@@ -0,0 +1,13 @@
+package cmd
+
+// addCorpusCleanFlags registers the command-line flags of the corpus clean subcommand on
+// corpusCleanCmd. As written it always returns nil.
+func addCorpusCleanFlags() error {
+	flags := corpusCleanCmd.Flags()
+
+	// List flags in declaration order in the usage message rather than alphabetically
+	flags.SortFlags = false
+
+	// --config: path to the project configuration file
+	flags.String("config", "",
+		"path to config file (default: medusa.json in current directory)")
+
+	return nil
+}
diff --git a/fuzzing/corpus/corpus.go b/fuzzing/corpus/corpus.go
@@ -574,3 +574,128 @@ func (c *Corpus) PruneSequences(ctx context.Context, chain *chain.TestChain) (in
 	c.mutationTargetSequenceChooser.RemoveChoices(toRemove)
 	return len(toRemove), nil
 }
+
+// CleanInvalidSequencesResult contains the results of cleaning invalid sequences from the corpus.
+type CleanInvalidSequencesResult struct {
+	// TotalSequences is the total number of sequences in the corpus before cleaning.
+	TotalSequences int
+	// ValidSequences is the number of sequences that were successfully executed.
+	ValidSequences int
+	// InvalidSequences is the list of filenames of the sequences that were found invalid and
+	// for which removal from disk was attempted.
+	InvalidSequences []string
+}
+
+// CleanInvalidSequences walks over a snapshot of the corpus' call sequence files and checks
+// whether each sequence can still be run on testChain. A sequence is treated as invalid, and its
+// file is removed from disk, when it cannot be cloned, when one of its call targets is missing
+// from deployedContracts, when its ABI values cannot be resolved against the target contract's
+// ABI, or when executing it returns an error.
+//
+// deployedContracts maps the addresses deployed on the test chain to their contract definitions.
+//
+// The chain is reverted to its starting block index after each executed sequence. If ctx is done
+// the statistics gathered so far are returned with a nil error. If the chain cannot be reverted
+// the partial result is returned together with an error.
+func (c *Corpus) CleanInvalidSequences(
+	ctx context.Context,
+	testChain *chain.TestChain,
+	deployedContracts map[common.Address]*contracts.Contract,
+) (*CleanInvalidSequencesResult, error) {
+	result := &CleanInvalidSequencesResult{
+		InvalidSequences: make([]string, 0),
+	}
+
+	// Block index the chain is rolled back to after every executed sequence
+	baseBlockIndex := uint64(len(testChain.CommittedBlocks()))
+
+	// Take a snapshot of the file list under the lock and work on the copy afterwards
+	c.callSequencesLock.Lock()
+	files := make([]*corpusFile[calls.CallSequence], len(c.callSequenceFiles.files))
+	copy(files, c.callSequenceFiles.files)
+	c.callSequencesLock.Unlock()
+
+	result.TotalSequences = len(files)
+
+	// discard records a file as invalid and tries to delete it; a failed delete is only logged
+	discard := func(fileName string) {
+		result.InvalidSequences = append(result.InvalidSequences, fileName)
+		if _, removeErr := c.callSequenceFiles.removeFileFromDisk(fileName); removeErr != nil {
+			c.logger.Warn("Failed to remove invalid sequence file: ", fileName, " error: ", removeErr)
+		}
+	}
+
+	// The execution check never requests an early stop; only the returned error matters here
+	neverStop := func(_ calls.CallSequence) (bool, error) {
+		return false, nil
+	}
+
+	for _, file := range files {
+		if utils.CheckContextDone(ctx) {
+			return result, nil
+		}
+
+		// Validate a clone so the corpus' own copy of the sequence is left untouched
+		seq, err := file.data.Clone()
+		if err != nil {
+			discard(file.fileName)
+			continue
+		}
+
+		// Bind every call to its deployed contract and resolve its ABI values
+		bound := true
+		for _, element := range seq {
+			// Contract creations and elements without a call have nothing to bind
+			if element.Call == nil || element.Call.To == nil {
+				continue
+			}
+
+			contractDef, found := deployedContracts[*element.Call.To]
+			if !found {
+				bound = false
+				break
+			}
+			element.Contract = contractDef
+
+			if abiValues := element.Call.DataAbiValues; abiValues != nil {
+				if err := abiValues.Resolve(contractDef.CompiledContract().Abi); err != nil {
+					bound = false
+					break
+				}
+			}
+		}
+		if !bound {
+			discard(file.fileName)
+			continue
+		}
+
+		// Feed the bound elements to the executor one at a time; nil signals the end
+		nextElement := func(currentIndex int) (*calls.CallSequenceElement, error) {
+			if currentIndex >= len(seq) {
+				return nil, nil
+			}
+			return seq[currentIndex], nil
+		}
+		_, execErr := calls.ExecuteCallSequenceIteratively(testChain, nextElement, neverStop)
+
+		// Roll the chain back before deciding what to do with the sequence
+		if revertErr := testChain.RevertToBlockIndex(baseBlockIndex); revertErr != nil {
+			return result, fmt.Errorf("failed to revert chain state: %w", revertErr)
+		}
+
+		if execErr != nil {
+			discard(file.fileName)
+			continue
+		}
+		result.ValidSequences++
+	}
+
+	return result, nil
+}
diff --git a/fuzzing/corpus/corpus_cleaner.go b/fuzzing/corpus/corpus_cleaner.go
@@ -0,0 +1,65 @@
+package corpus
+
+import (
+	"context"
+	"fmt"
+
+	"github.com/crytic/medusa-geth/common"
+	"github.com/crytic/medusa/chain"
+	"github.com/crytic/medusa/fuzzing/contracts"
+	"github.com/crytic/medusa/logging"
+)
+
+// CorpusCleaner provides functionality to clean invalid sequences from a corpus.
+// It holds only a corpus and a logger and has no dependency on the Fuzzer type; the test chain
+// and deployed contracts are passed to Clean by the caller.
+type CorpusCleaner struct {
+	// corpus is the corpus to be cleaned
+	corpus *Corpus
+	// logger is the logger supplied to NewCorpusCleaner
+	logger *logging.Logger
+}
+
+// CleanResult contains the results of a corpus cleaning operation. Its fields mirror those of
+// CleanInvalidSequencesResult, from which Clean copies them.
+type CleanResult struct {
+	// TotalSequences is the total number of sequences in the corpus before cleaning.
+	TotalSequences int
+	// ValidSequences is the number of sequences that executed successfully.
+	ValidSequences int
+	// InvalidSequences is the list of filenames that were invalid.
+	InvalidSequences []string
+}
+
+// NewCorpusCleaner returns a CorpusCleaner that operates on the given corpus and keeps the given
+// logger.
+func NewCorpusCleaner(corpus *Corpus, logger *logging.Logger) *CorpusCleaner {
+	cleaner := new(CorpusCleaner)
+	cleaner.corpus = corpus
+	cleaner.logger = logger
+	return cleaner
+}
+
+// Clean validates the corpus' call sequences by delegating to Corpus.CleanInvalidSequences with
+// the provided test chain and deployed contracts. Sequences that fail to execute are removed
+// from disk.
+//
+// On success the chain is reverted once more to the block index it had when Clean was called.
+// A failure of that final revert does not discard the cleaning results: it is logged as a
+// warning and the results are still returned.
+//
+// Returns the cleaning results, or nil and a wrapped error if the cleaning itself failed.
+func (cc *CorpusCleaner) Clean(
+	ctx context.Context,
+	testChain *chain.TestChain,
+	deployedContracts map[common.Address]*contracts.Contract,
+) (*CleanResult, error) {
+	// Get base block index for reverting
+	chainBaseIndex := uint64(len(testChain.CommittedBlocks()))
+
+	// Use the corpus's cleaning method
+	cleanResult, err := cc.corpus.CleanInvalidSequences(ctx, testChain, deployedContracts)
+	if err != nil {
+		return nil, fmt.Errorf("error during corpus cleaning: %w", err)
+	}
+
+	// Revert to base state. The files have already been cleaned at this point, so a failed
+	// revert is surfaced as a warning instead of being silently dropped or failing the call.
+	if revertErr := testChain.RevertToBlockIndex(chainBaseIndex); revertErr != nil && cc.logger != nil {
+		cc.logger.Warn("Failed to revert test chain after corpus cleaning: ", revertErr)
+	}
+
+	return &CleanResult{
+		TotalSequences:   cleanResult.TotalSequences,
+		ValidSequences:   cleanResult.ValidSequences,
+		InvalidSequences: cleanResult.InvalidSequences,
+	}, nil
+}
diff --git a/fuzzing/corpus/corpus_files.go b/fuzzing/corpus/corpus_files.go
diff --git a/fuzzing/fuzzer.go b/fuzzing/fuzzer.go