Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

adding archive entry paths #3638

Open
wants to merge 13 commits into
base: main
Choose a base branch
from
Open
11 changes: 11 additions & 0 deletions pkg/engine/engine.go
Original file line number Diff line number Diff line change
Expand Up @@ -1155,6 +1155,17 @@ func (e *Engine) processResult(
return
}

// Add in handler metadata
if res.ExtraData == nil && data.chunk.HandleMetadata != nil {
res.ExtraData = data.chunk.HandleMetadata
} else {
for k, v := range data.chunk.HandleMetadata {
if _, ok := res.ExtraData[k]; !ok {
res.ExtraData[k] = v
}
joeleonjr marked this conversation as resolved.
Show resolved Hide resolved
}
}

secret := detectors.CopyMetadata(&data.chunk, res)
secret.DecoderType = data.decoder
secret.DetectorDescription = data.detector.Detector.Description()
Expand Down
4 changes: 3 additions & 1 deletion pkg/handlers/apk.go
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,9 @@ func (h *apkHandler) handleAPKFileContent(
ctx,
"filename", fileName,
)
return h.handleNonArchiveContent(ctx, mimeReader, apkChan)
// Note: the *zip.File.Name attribute (passed into this function as fileName)
// includes the full path of the entry within the APK.
return h.handleNonArchiveContent(ctx, fileName, mimeReader, apkChan)
}

// createZipReader creates a new ZIP reader from the input fileReader.
Expand Down
3 changes: 2 additions & 1 deletion pkg/handlers/ar.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,8 @@ func (h *arHandler) processARFiles(ctx logContext.Context, reader *deb.Ar, dataO
continue
}

if err := h.handleNonArchiveContent(fileCtx, rdr, dataOrErrChan); err != nil {
// Note: emptyFilePath is used as the archiveEntryPath value because the `ar` format is a flat archive format (no nested directories).
if err := h.handleNonArchiveContent(fileCtx, emptyFilePath, rdr, dataOrErrChan); err != nil {
dataOrErrChan <- DataOrErr{
Err: fmt.Errorf("%w: error handling archive content in AR: %v", ErrProcessingWarning, err),
}
Expand Down
23 changes: 14 additions & 9 deletions pkg/handlers/archive.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ func (h *archiveHandler) HandleFile(ctx logContext.Context, input fileReader) ch
}()

start := time.Now()
err := h.openArchive(ctx, 0, input, dataOrErrChan)
err := h.openArchive(ctx, 0, emptyFilePath, input, dataOrErrChan)
if err == nil {
h.metrics.incFilesProcessed()
}
Expand All @@ -101,12 +101,13 @@ func (h *archiveHandler) HandleFile(ctx logContext.Context, input fileReader) ch
var ErrMaxDepthReached = errors.New("max archive depth reached")

// openArchive recursively extracts content from an archive up to a maximum depth, handling nested archives if necessary.
// It takes a reader from which it attempts to identify and process the archive format. Depending on the archive type,
// it either decompresses or extracts the contents directly, sending data to the provided channel.
// It takes a string representing the entry path within the archive and a reader from which it attempts to identify and process the archive format.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This comment reads "a string" but the actual function takes a string slice. Which is correct?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll fix that. That comment was from a previous version.

// Depending on the archive type, it either decompresses or extracts the contents directly, sending data to the provided channel.
// Returns an error if the archive cannot be processed due to issues like exceeding maximum depth or unsupported formats.
func (h *archiveHandler) openArchive(
ctx logContext.Context,
depth int,
archiveEntryPath string,
joeleonjr marked this conversation as resolved.
Show resolved Hide resolved
reader fileReader,
dataOrErrChan chan DataOrErr,
) error {
Expand All @@ -124,14 +125,14 @@ func (h *archiveHandler) openArchive(

if reader.format == nil {
if depth > 0 {
return h.handleNonArchiveContent(ctx, newMimeTypeReaderFromFileReader(reader), dataOrErrChan)
return h.handleNonArchiveContent(ctx, archiveEntryPath, newMimeTypeReaderFromFileReader(reader), dataOrErrChan)
}
return fmt.Errorf("unknown archive format")
}

switch archive := reader.format.(type) {
case archiver.Decompressor:
// Decompress tha archive and feed the decompressed data back into the archive handler to extract any nested archives.
// Decompress the archive and feed the decompressed data back into the archive handler to extract any nested archives.
compReader, err := archive.OpenReader(reader)
if err != nil {
return fmt.Errorf("error opening decompressor with format: %s %w", reader.format.Name(), err)
Expand All @@ -152,9 +153,13 @@ func (h *archiveHandler) openArchive(
}
defer rdr.Close()

return h.openArchive(ctx, depth+1, rdr, dataOrErrChan)
// Note: We're limited in our ability to add file names to the archiveEntryPath here,
// as the decompressor doesn't have access to a fileName value. Instead, we append a
// generic marker indicating that the file was decompressed. This could be improved.
joeleonjr marked this conversation as resolved.
Show resolved Hide resolved
if depth > 0 {
archiveEntryPath = archiveEntryPath + "(decompressed " + reader.format.Name() + " file)"
}
return h.openArchive(ctx, depth+1, archiveEntryPath, rdr, dataOrErrChan)
case archiver.Extractor:
err := archive.Extract(logContext.WithValue(ctx, depthKey, depth+1), reader, nil, h.extractorHandler(dataOrErrChan))
err := archive.Extract(logContext.WithValue(ctx, depthKey, depth+1), reader, nil, h.extractorHandler(archiveEntryPath, dataOrErrChan))
joeleonjr marked this conversation as resolved.
Show resolved Hide resolved
if err != nil {
return fmt.Errorf("error extracting archive with format: %s: %w", reader.format.Name(), err)
}
Expand All @@ -168,7 +173,7 @@ func (h *archiveHandler) openArchive(
// It logs the extraction, checks for cancellation, and decides whether to skip the file based on its name or type,
// particularly for binary files if configured to skip. If the file is not skipped, it recursively calls openArchive
// to handle nested archives or to continue processing based on the file's content and depth in the archive structure.
func (h *archiveHandler) extractorHandler(dataOrErrChan chan DataOrErr) func(context.Context, archiver.File) error {
func (h *archiveHandler) extractorHandler(archiveEntryPath string, dataOrErrChan chan DataOrErr) func(context.Context, archiver.File) error {
return func(ctx context.Context, file archiver.File) error {
lCtx := logContext.WithValues(
logContext.AddLogger(ctx),
Expand Down Expand Up @@ -240,6 +245,6 @@ func (h *archiveHandler) extractorHandler(dataOrErrChan chan DataOrErr) func(con
h.metrics.observeFileSize(fileSize)

lCtx.Logger().V(4).Info("Processed file successfully", "filename", file.Name(), "size", file.Size())
return h.openArchive(lCtx, depth, rdr, dataOrErrChan)
return h.openArchive(lCtx, depth, (archiveEntryPath + "/" + file.Name()), rdr, dataOrErrChan)
joeleonjr marked this conversation as resolved.
Show resolved Hide resolved
}
}
14 changes: 13 additions & 1 deletion pkg/handlers/archive_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,60 +19,71 @@ func TestArchiveHandler(t *testing.T) {
expectedChunks int
matchString string
expectErr bool
matchFileName string
}{
"gzip-single": {
"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/one-zip.gz",
1,
"AKIAYVP4CIPPH5TNP3SW",
false,
"",
},
"gzip-nested": {
"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/double-zip.gz",
1,
"AKIAYVP4CIPPH5TNP3SW",
false,
// This is because we can't get the file path from a nested archiver.OpenReader()
"(decompressed .gz file)",
},
"gzip-too-deep": {
"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/six-zip.gz",
0,
"",
true,
"",
},
"tar-single": {
"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/one.tar",
1,
"AKIAYVP4CIPPH5TNP3SW",
false,
"/aws-canary-creds",
},
"tar-nested": {
"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/two.tar",
1,
"AKIAYVP4CIPPH5TNP3SW",
false,
"/one.tar/aws-canary-creds",
},
"tar-too-deep": {
"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/six.tar",
0,
"",
true,
"",
},
"targz-single": {
"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/tar-archive.tar.gz",
1,
"AKIAYVP4CIPPH5TNP3SW",
false,
"/aws-canary-creds",
},
"gzip-large": {
"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/FifteenMB.gz",
1543,
"AKIAYVP4CIPPH5TNP3SW",
false,
"",
},
"zip-single": {
"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/aws-canary-creds.zip",
1,
"AKIAYVP4CIPPH5TNP3SW",
false,
"/aws-canary-creds",
},
}

Expand Down Expand Up @@ -104,6 +115,7 @@ func TestArchiveHandler(t *testing.T) {
count++
if re.Match(chunk.Data) {
matched = true
assert.Equal(t, chunk.ArchiveEntryPath, testCase.matchFileName)
}
}

Expand All @@ -125,6 +137,6 @@ func TestOpenInvalidArchive(t *testing.T) {

dataOrErrChan := make(chan DataOrErr)

err = handler.openArchive(ctx, 0, rdr, dataOrErrChan)
err = handler.openArchive(ctx, 0, emptyFilePath, rdr, dataOrErrChan)
assert.Error(t, err)
}
6 changes: 5 additions & 1 deletion pkg/handlers/default.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ func (h *defaultHandler) HandleFile(ctx logContext.Context, input fileReader) ch
defer close(dataOrErrChan)

start := time.Now()
err := h.handleNonArchiveContent(ctx, newMimeTypeReaderFromFileReader(input), dataOrErrChan)
err := h.handleNonArchiveContent(ctx, emptyFilePath, newMimeTypeReaderFromFileReader(input), dataOrErrChan)
if err == nil {
h.metrics.incFilesProcessed()
}
Expand Down Expand Up @@ -93,6 +93,7 @@ func (h *defaultHandler) measureLatencyAndHandleErrors(
// file content, regardless of being an archive or not, is handled appropriately.
func (h *defaultHandler) handleNonArchiveContent(
ctx logContext.Context,
archiveEntryPath string,
reader mimeTypeReader,
dataOrErrChan chan DataOrErr,
) error {
Expand All @@ -109,6 +110,9 @@ func (h *defaultHandler) handleNonArchiveContent(
chunkReader := sources.NewChunkReader()
for data := range chunkReader(ctx, reader) {
dataOrErr := DataOrErr{}
if archiveEntryPath != "" {
dataOrErr.ArchiveEntryPath = archiveEntryPath
}
if err := data.Error(); err != nil {
h.metrics.incErrors()
dataOrErr.Err = fmt.Errorf("%w: error reading chunk: %v", ErrProcessingWarning, err)
Expand Down
12 changes: 10 additions & 2 deletions pkg/handlers/handlers.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@ var (
// ErrProcessingWarning indicates a recoverable error that can be logged,
// allowing processing to continue.
ErrProcessingWarning = errors.New("error processing file")

// emptyFilePath is used to represent an empty file path.
emptyFilePath = ""
)

type readerConfig struct{ fileExtension string }
Expand Down Expand Up @@ -183,8 +186,9 @@ func newFileReader(r io.Reader, options ...readerOption) (fReader fileReader, er
// efficient streaming of file contents while also providing a way to propagate errors
// that may occur during the file handling process.
type DataOrErr struct {
Data []byte
Err error
Data []byte
Err error
	ArchiveEntryPath string // optional; set only for entries extracted from archives
}

// FileHandler represents a handler for files.
Expand Down Expand Up @@ -402,6 +406,7 @@ func HandleFile(
// - If it contains an error, the function handles it based on severity:
// - Fatal errors (context cancellation, deadline exceeded, ErrProcessingFatal) cause immediate termination
// - Non-fatal errors (ErrProcessingWarning and others) are logged and processing continues
//
// The function also listens for context cancellation to gracefully terminate processing if the context is done.
// It returns nil upon successful processing of all data, or the first encountered fatal error.
func handleChunksWithError(
Expand All @@ -428,6 +433,9 @@ func handleChunksWithError(
if len(dataOrErr.Data) > 0 {
chunk := *chunkSkel
chunk.Data = dataOrErr.Data
if dataOrErr.ArchiveEntryPath != "" {
chunk.HandleMetadata = map[string]string{"Archive Entry Path": dataOrErr.ArchiveEntryPath}
}
if err := reporter.ChunkOk(ctx, chunk); err != nil {
return fmt.Errorf("error reporting chunk: %w", err)
}
Expand Down
4 changes: 3 additions & 1 deletion pkg/handlers/rpm.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,9 @@
return fmt.Errorf("error creating mime-type reader: %w", err)
}

if err := h.handleNonArchiveContent(fileCtx, rdr, dataOrErrChan); err != nil {
// TODO: Update processRPMFiles to accommodate nested archives. Once completed,

Check failure on line 118 in pkg/handlers/rpm.go

View workflow job for this annotation

GitHub Actions / golangci-lint

`accomdate` is a misspelling of `accommodate` (misspell)
// adjust the emptyFilePath value to reflect the actual file path.
if err := h.handleNonArchiveContent(fileCtx, emptyFilePath, rdr, dataOrErrChan); err != nil {
dataOrErrChan <- DataOrErr{
Err: fmt.Errorf("%w: error processing RPM archive: %v", ErrProcessingWarning, err),
}
Expand Down
2 changes: 2 additions & 0 deletions pkg/sources/sources.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ type Chunk struct {
SourceMetadata *source_metadatapb.MetaData
// SourceType is the type of Source that produced the chunk.
SourceType sourcespb.SourceType
// HandleMetadata holds the metadata from a handler if one was used.
HandleMetadata map[string]string
joeleonjr marked this conversation as resolved.
Show resolved Hide resolved

// Verify specifies whether any secrets in the Chunk should be verified.
Verify bool
Expand Down
Loading