Skip to content

Commit bc3d1d0

Browse files
dguido, claude, anishnaik
authored
cmd: add corpus clean command to remove invalid sequences (crytic#777)
* cmd: add corpus clean command to remove invalid sequences After contract refactoring, the corpus may contain many invalid sequences that cannot be executed. This adds a new `medusa corpus clean` command to remove these invalid sequences from disk. The command: - Compiles contracts and sets up a test chain - Loads and validates each call sequence in the corpus - Removes sequences that fail to execute (contract resolution failures, ABI mismatches, or execution errors) - Supports --dry-run to preview what would be deleted - Reports statistics on valid/invalid sequences Fixes crytic#743 Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> * corpus: refactor corpus cleaner to match corpus_pruner pattern Move CorpusCleaner to fuzzing/corpus/ package to improve separation of concerns and match the organizational pattern of corpus_pruner.go. Changes: - Create fuzzing/corpus/corpus_cleaner.go with refactored CorpusCleaner that receives dependencies as parameters (no *Fuzzer dependency) - Update fuzzing/corpus_cleaner.go to be a thin wrapper that sets up dependencies and delegates to corpus.CorpusCleaner - Follows same pattern as corpus_pruner.go for consistency Benefits: - Better separation of concerns (corpus package independent of Fuzzer) - More testable (CorpusCleaner can be tested with mocked dependencies) - Consistent architecture within the corpus package Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com> * cmd: refactor corpus clean subcommand to remove dry-run and standardize architecture - Remove --dry-run flag completely (invalid sequences are always deleted) - Add CreateTestChainForCleaning() public helper to Fuzzer for CLI use - Delete fuzzing/corpus_cleaner.go wrapper (CLI now calls corpus package directly) - Standardize flag handling by creating cmd/corpus_flags.go - Move all cleaning logic to corpus package (no Fuzzer dependency) - Follow corpus_pruner pattern for clean separation of concerns Co-Authored-By: Claude Sonnet 4.5 
<noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com> Co-authored-by: anishnaik <anish.naik@trailofbits.com>
1 parent 5e9ece7 commit bc3d1d0

File tree

6 files changed

+459
-0
lines changed

6 files changed

+459
-0
lines changed

cmd/corpus.go

Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
package cmd
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"os"
7+
"os/signal"
8+
"path/filepath"
9+
"time"
10+
11+
"github.com/crytic/medusa/fuzzing"
12+
"github.com/crytic/medusa/fuzzing/config"
13+
"github.com/crytic/medusa/fuzzing/corpus"
14+
"github.com/crytic/medusa/logging/colors"
15+
"github.com/spf13/cobra"
16+
)
17+
18+
// corpusCmd represents the corpus command group
19+
var corpusCmd = &cobra.Command{
20+
Use: "corpus",
21+
Short: "Manage the fuzzing corpus",
22+
Long: `Commands for managing the fuzzing corpus, including cleaning invalid sequences.`,
23+
}
24+
25+
// corpusCleanCmd is the "clean" subcommand registered under corpusCmd. Its RunE handler,
// cmdRunCorpusClean, validates the corpus and removes invalid call sequences from disk.
26+
var corpusCleanCmd = &cobra.Command{
27+
Use: "clean",
28+
Short: "Remove invalid sequences from the corpus",
29+
Long: `Validates each call sequence in the corpus by attempting to execute it on a test chain.
30+
Sequences that fail (due to contract changes, ABI mismatches, or execution errors) are removed from disk.
31+
32+
This command is useful after refactoring contracts when the corpus contains many invalid sequences.`,
33+
RunE: cmdRunCorpusClean,
34+
// cobra's own usage/error output is silenced; cmdRunCorpusClean logs each failure through
// cmdLogger before returning it.
SilenceUsage: true,
35+
SilenceErrors: true,
36+
}
37+
38+
func init() {
39+
// Add flags
40+
err := addCorpusCleanFlags()
41+
if err != nil {
42+
cmdLogger.Panic("Failed to initialize the corpus command", err)
43+
}
44+
45+
// Add subcommands to corpus command
46+
corpusCmd.AddCommand(corpusCleanCmd)
47+
48+
// Add corpus command to root
49+
rootCmd.AddCommand(corpusCmd)
50+
}
51+
52+
// cmdRunCorpusClean executes the corpus clean command
53+
func cmdRunCorpusClean(cmd *cobra.Command, args []string) error {
54+
// Get config path from flag
55+
configFlagUsed := cmd.Flags().Changed("config")
56+
configPath, err := cmd.Flags().GetString("config")
57+
if err != nil {
58+
cmdLogger.Error("Failed to get config flag", err)
59+
return err
60+
}
61+
62+
if !configFlagUsed {
63+
workingDirectory, err := os.Getwd()
64+
if err != nil {
65+
cmdLogger.Error("Failed to get working directory", err)
66+
return err
67+
}
68+
configPath = filepath.Join(workingDirectory, DefaultProjectConfigFilename)
69+
}
70+
71+
// Check if config file exists
72+
if _, err := os.Stat(configPath); err != nil {
73+
cmdLogger.Error("Config file not found", err)
74+
return fmt.Errorf("config file not found at %s", configPath)
75+
}
76+
77+
// Read config
78+
cmdLogger.Info("Reading configuration file at: ", colors.Bold, configPath, colors.Reset)
79+
projectConfig, err := config.ReadProjectConfigFromFile(configPath, DefaultCompilationPlatform)
80+
if err != nil {
81+
cmdLogger.Error("Failed to read config file", err)
82+
return err
83+
}
84+
85+
// Change to config directory
86+
if err := os.Chdir(filepath.Dir(configPath)); err != nil {
87+
cmdLogger.Error("Failed to change to config directory", err)
88+
return err
89+
}
90+
91+
// Check if corpus directory is configured
92+
if projectConfig.Fuzzing.CorpusDirectory == "" {
93+
cmdLogger.Error("No corpus directory configured", nil)
94+
return fmt.Errorf("no corpus directory configured in %s", configPath)
95+
}
96+
97+
// Check if corpus directory exists
98+
corpusDir := projectConfig.Fuzzing.CorpusDirectory
99+
if _, err := os.Stat(corpusDir); os.IsNotExist(err) {
100+
cmdLogger.Error("Corpus directory does not exist", nil)
101+
return fmt.Errorf("corpus directory does not exist: %s", corpusDir)
102+
}
103+
104+
// Create fuzzer (this handles compilation and contract definitions)
105+
cmdLogger.Info("Initializing fuzzer...")
106+
fuzzer, err := fuzzing.NewFuzzer(*projectConfig)
107+
if err != nil {
108+
cmdLogger.Error("Failed to initialize fuzzer", err)
109+
return err
110+
}
111+
112+
// Create context with cancellation for interrupt handling
113+
ctx, cancel := context.WithCancel(context.Background())
114+
defer cancel()
115+
116+
// Handle interrupt
117+
sigCh := make(chan os.Signal, 1)
118+
signal.Notify(sigCh, os.Interrupt)
119+
go func() {
120+
<-sigCh
121+
cmdLogger.Info("Interrupted, stopping...")
122+
cancel()
123+
}()
124+
125+
// Create test chain and build deployed contracts map
126+
cmdLogger.Info("Setting up test chain...")
127+
testChain, deployedContracts, err := fuzzer.CreateTestChainForCleaning()
128+
if err != nil {
129+
cmdLogger.Error("Failed to setup test chain", err)
130+
return err
131+
}
132+
defer testChain.Close()
133+
134+
// Create and initialize the corpus
135+
cmdLogger.Info("Creating corpus...")
136+
fuzzerCorpus, err := corpus.NewCorpus(projectConfig.Fuzzing.CorpusDirectory)
137+
if err != nil {
138+
cmdLogger.Error("Failed to create the corpus", err)
139+
return err
140+
}
141+
err = fuzzerCorpus.Initialize(testChain, fuzzer.ContractDefinitions())
142+
if err != nil {
143+
cmdLogger.Error("Failed to initialize the corpus", err)
144+
return err
145+
}
146+
147+
cmdLogger.Info("Loading and validating corpus from: ", colors.Bold, corpusDir, colors.Reset)
148+
149+
// Create cleaner and run
150+
cleaner := corpus.NewCorpusCleaner(fuzzerCorpus, cmdLogger)
151+
start := time.Now()
152+
result, err := cleaner.Clean(ctx, testChain, deployedContracts)
153+
if err != nil {
154+
cmdLogger.Error("Error during corpus cleaning", err)
155+
return err
156+
}
157+
cmdLogger.Info("Corpus cleaning completed in ", time.Since(start).Round(time.Second))
158+
159+
// Report results
160+
invalidCount := len(result.InvalidSequences)
161+
cmdLogger.Info(
162+
"Results: ",
163+
colors.Bold, result.ValidSequences, colors.Reset, " valid, ",
164+
colors.Bold, invalidCount, colors.Reset, " invalid out of ",
165+
colors.Bold, result.TotalSequences, colors.Reset, " total sequences",
166+
)
167+
168+
if invalidCount > 0 {
169+
cmdLogger.Info(colors.Bold, invalidCount, colors.Reset, " invalid sequences removed from disk")
170+
}
171+
172+
return nil
173+
}

cmd/corpus_flags.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
package cmd
2+
3+
// addCorpusCleanFlags adds flags for the corpus clean subcommand
4+
func addCorpusCleanFlags() error {
5+
// Prevent alphabetical sorting of usage message
6+
corpusCleanCmd.Flags().SortFlags = false
7+
8+
// Config file path
9+
corpusCleanCmd.Flags().String("config", "",
10+
"path to config file (default: medusa.json in current directory)")
11+
12+
return nil
13+
}

fuzzing/corpus/corpus.go

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -574,3 +574,128 @@ func (c *Corpus) PruneSequences(ctx context.Context, chain *chain.TestChain) (in
574574
c.mutationTargetSequenceChooser.RemoveChoices(toRemove)
575575
return len(toRemove), nil
576576
}
577+
578+
// CleanInvalidSequencesResult contains the results of cleaning invalid sequences from the corpus,
// as returned by Corpus.CleanInvalidSequences.
579+
type CleanInvalidSequencesResult struct {
580+
// TotalSequences is the number of call sequence files in the corpus when cleaning started.
// If cleaning stops early because the context is done, ValidSequences plus
// len(InvalidSequences) can be smaller than this value.
581+
TotalSequences int
582+
// ValidSequences is the number of sequences that were bound and executed without error.
583+
ValidSequences int
584+
// InvalidSequences is the list of filenames that were found invalid. Removal from disk is
585+
// attempted for each one; a removal failure is only logged, so the name is still listed here.
586+
InvalidSequences []string
587+
}
588+
589+
// CleanInvalidSequences validates each call sequence in the corpus by attempting to execute it on
590+
// the provided test chain. Sequences that fail to execute (due to contract resolution failures,
591+
// ABI mismatches, or execution errors) are considered invalid and removed from disk.
592+
//
593+
// The deployedContracts map should contain the contracts deployed on the test chain, mapping
594+
// addresses to their contract definitions.
595+
//
596+
// Returns a CleanInvalidSequencesResult containing statistics about the cleaning operation, or an
597+
// error if one occurs.
598+
func (c *Corpus) CleanInvalidSequences(
599+
ctx context.Context,
600+
testChain *chain.TestChain,
601+
deployedContracts map[common.Address]*contracts.Contract,
602+
) (*CleanInvalidSequencesResult, error) {
603+
result := &CleanInvalidSequencesResult{
604+
InvalidSequences: make([]string, 0),
605+
}
606+
607+
// Get the chain's testing base index for reverting
608+
chainOriginalIndex := uint64(len(testChain.CommittedBlocks()))
609+
610+
// Process call sequence files
611+
c.callSequencesLock.Lock()
612+
sequenceFiles := make([]*corpusFile[calls.CallSequence], len(c.callSequenceFiles.files))
613+
copy(sequenceFiles, c.callSequenceFiles.files)
614+
c.callSequencesLock.Unlock()
615+
616+
result.TotalSequences = len(sequenceFiles)
617+
618+
for _, seqFile := range sequenceFiles {
619+
if utils.CheckContextDone(ctx) {
620+
return result, nil
621+
}
622+
623+
// Clone the sequence for validation
624+
seq, err := seqFile.data.Clone()
625+
if err != nil {
626+
result.InvalidSequences = append(result.InvalidSequences, seqFile.fileName)
627+
if _, removeErr := c.callSequenceFiles.removeFileFromDisk(seqFile.fileName); removeErr != nil {
628+
c.logger.Warn("Failed to remove invalid sequence file: ", seqFile.fileName, " error: ", removeErr)
629+
}
630+
continue
631+
}
632+
633+
// Try to bind and execute each element in the sequence
634+
valid := true
635+
for i, element := range seq {
636+
// Skip contract creation calls or elements with nil Call
637+
if element.Call == nil || element.Call.To == nil {
638+
continue
639+
}
640+
641+
// Try to resolve the contract
642+
contractDef, ok := deployedContracts[*element.Call.To]
643+
if !ok {
644+
valid = false
645+
break
646+
}
647+
element.Contract = contractDef
648+
649+
// Try to resolve ABI values if present
650+
if abiValues := element.Call.DataAbiValues; abiValues != nil {
651+
if err := abiValues.Resolve(contractDef.CompiledContract().Abi); err != nil {
652+
valid = false
653+
break
654+
}
655+
}
656+
657+
// Update the sequence with the bound element
658+
seq[i] = element
659+
}
660+
661+
// If binding succeeded, try to execute the sequence
662+
if valid {
663+
fetchElementFunc := func(currentIndex int) (*calls.CallSequenceElement, error) {
664+
if currentIndex >= len(seq) {
665+
return nil, nil
666+
}
667+
return seq[currentIndex], nil
668+
}
669+
670+
// Execute without checking results - we just want to know if it runs without error
671+
executionCheckFunc := func(_ calls.CallSequence) (bool, error) {
672+
return false, nil
673+
}
674+
675+
_, execErr := calls.ExecuteCallSequenceIteratively(
676+
testChain,
677+
fetchElementFunc,
678+
executionCheckFunc,
679+
)
680+
if execErr != nil {
681+
valid = false
682+
}
683+
684+
// Revert chain state
685+
if revertErr := testChain.RevertToBlockIndex(chainOriginalIndex); revertErr != nil {
686+
return result, fmt.Errorf("failed to revert chain state: %w", revertErr)
687+
}
688+
}
689+
690+
if valid {
691+
result.ValidSequences++
692+
} else {
693+
result.InvalidSequences = append(result.InvalidSequences, seqFile.fileName)
694+
if _, removeErr := c.callSequenceFiles.removeFileFromDisk(seqFile.fileName); removeErr != nil {
695+
c.logger.Warn("Failed to remove invalid sequence file: ", seqFile.fileName, " error: ", removeErr)
696+
}
697+
}
698+
}
699+
700+
return result, nil
701+
}

fuzzing/corpus/corpus_cleaner.go

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
package corpus
2+
3+
import (
4+
"context"
5+
"fmt"
6+
7+
"github.com/crytic/medusa-geth/common"
8+
"github.com/crytic/medusa/chain"
9+
"github.com/crytic/medusa/fuzzing/contracts"
10+
"github.com/crytic/medusa/logging"
11+
)
12+
13+
// CorpusCleaner provides functionality to clean invalid sequences from a corpus. Its Clean
// method delegates the actual validation and removal to Corpus.CleanInvalidSequences.
14+
// It follows the same pattern as CorpusPruner by not depending on the Fuzzer type.
15+
type CorpusCleaner struct {
16+
// corpus is the corpus to be cleaned.
17+
corpus *Corpus
18+
// logger is the logger handed to NewCorpusCleaner for reporting during cleaning.
19+
logger *logging.Logger
20+
}
21+
22+
// CleanResult contains the results of a corpus cleaning operation. CorpusCleaner.Clean fills it
// field-for-field from the CleanInvalidSequencesResult returned by the corpus.
23+
type CleanResult struct {
24+
// TotalSequences is the total number of sequences in the corpus before cleaning.
25+
TotalSequences int
26+
// ValidSequences is the number of sequences that executed successfully.
27+
ValidSequences int
28+
// InvalidSequences is the list of filenames of the sequences that were found invalid.
29+
InvalidSequences []string
30+
}
31+
32+
// NewCorpusCleaner creates a new CorpusCleaner.
33+
func NewCorpusCleaner(corpus *Corpus, logger *logging.Logger) *CorpusCleaner {
34+
return &CorpusCleaner{
35+
corpus: corpus,
36+
logger: logger,
37+
}
38+
}
39+
40+
// Clean validates call sequences using the provided test chain and deployed contracts.
41+
// Sequences that fail to execute are removed from disk.
42+
// Returns the cleaning results and any error encountered.
43+
func (cc *CorpusCleaner) Clean(
44+
ctx context.Context,
45+
testChain *chain.TestChain,
46+
deployedContracts map[common.Address]*contracts.Contract,
47+
) (*CleanResult, error) {
48+
// Get base block index for reverting
49+
chainBaseIndex := uint64(len(testChain.CommittedBlocks()))
50+
51+
// Use the corpus's cleaning method
52+
cleanResult, err := cc.corpus.CleanInvalidSequences(ctx, testChain, deployedContracts)
53+
if err != nil {
54+
return nil, fmt.Errorf("error during corpus cleaning: %w", err)
55+
}
56+
57+
// Revert to base state
58+
_ = testChain.RevertToBlockIndex(chainBaseIndex)
59+
60+
return &CleanResult{
61+
TotalSequences: cleanResult.TotalSequences,
62+
ValidSequences: cleanResult.ValidSequences,
63+
InvalidSequences: cleanResult.InvalidSequences,
64+
}, nil
65+
}

0 commit comments

Comments
 (0)