diff --git a/pkg/engine/engine.go b/pkg/engine/engine.go
index 9ec11dccc181..2a2c62f3bfa0 100644
--- a/pkg/engine/engine.go
+++ b/pkg/engine/engine.go
@@ -1155,6 +1155,17 @@ func (e *Engine) processResult(
 		return
 	}
 
+	// Add in handler metadata. Existing extra data is not overwritten.
+	if res.ExtraData == nil && data.chunk.HandleMetadata != nil {
+		res.ExtraData = data.chunk.HandleMetadata
+	} else {
+		for k, v := range data.chunk.HandleMetadata {
+			if _, ok := res.ExtraData[k]; !ok {
+				res.ExtraData[k] = v
+			}
+		}
+	}
+
 	secret := detectors.CopyMetadata(&data.chunk, res)
 	secret.DecoderType = data.decoder
 	secret.DetectorDescription = data.detector.Detector.Description()
diff --git a/pkg/handlers/apk.go b/pkg/handlers/apk.go
index 639ce3af3eb7..0b544b7e3c3e 100644
--- a/pkg/handlers/apk.go
+++ b/pkg/handlers/apk.go
@@ -204,7 +204,9 @@ func (h *apkHandler) handleAPKFileContent(
 		ctx,
 		"filename", fileName,
 	)
-	return h.handleNonArchiveContent(ctx, mimeReader, apkChan)
+	// Note: the *zip.File.Name attribute (passed into this function as fileName)
+	// includes the full path within the APK.
+	return h.handleNonArchiveContent(ctx, fileName, mimeReader, apkChan)
 }
 
 // createZipReader creates a new ZIP reader from the input fileReader.
diff --git a/pkg/handlers/ar.go b/pkg/handlers/ar.go
index 3f037887e52f..e691ed0167d0 100644
--- a/pkg/handlers/ar.go
+++ b/pkg/handlers/ar.go
@@ -104,7 +104,8 @@ func (h *arHandler) processARFiles(ctx logContext.Context, reader *deb.Ar, dataO
 			continue
 		}
 
-		if err := h.handleNonArchiveContent(fileCtx, rdr, dataOrErrChan); err != nil {
+		// Note: emptyFilePath is used as the archiveEntryPath value because `ar` is a flat archive format (no nested directories).
+		if err := h.handleNonArchiveContent(fileCtx, emptyFilePath, rdr, dataOrErrChan); err != nil {
 			dataOrErrChan <- DataOrErr{
 				Err: fmt.Errorf("%w: error handling archive content in AR: %v", ErrProcessingWarning, err),
 			}
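A minimal standalone sketch of the merge semantics in the engine.go hunk above: handler metadata is folded into the result's extra data, and keys already present always win. The helper name mergeHandlerMetadata is illustrative only, not part of the codebase.

    package main

    import "fmt"

    // mergeHandlerMetadata mirrors the engine.go hunk: a nil extra-data map
    // adopts the handler map wholesale; otherwise handler entries are copied
    // in only when the key is not already present.
    func mergeHandlerMetadata(extraData, handleMetadata map[string]string) map[string]string {
        if extraData == nil && handleMetadata != nil {
            return handleMetadata
        }
        for k, v := range handleMetadata {
            if _, ok := extraData[k]; !ok {
                extraData[k] = v
            }
        }
        return extraData
    }

    func main() {
        extra := map[string]string{"Archive Entry Path": "detector-set"}
        merged := mergeHandlerMetadata(extra, map[string]string{
            "Archive Entry Path": "one.tar/creds", // dropped: key already present
            "Other":              "kept",
        })
        fmt.Println(merged) // map[Archive Entry Path:detector-set Other:kept]
    }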
diff --git a/pkg/handlers/archive.go b/pkg/handlers/archive.go
index 500909704e1f..d9051b8f247c 100644
--- a/pkg/handlers/archive.go
+++ b/pkg/handlers/archive.go
@@ -4,6 +4,7 @@ import (
 	"context"
 	"errors"
 	"fmt"
+	"path/filepath"
 	"time"
 
 	"github.com/mholt/archiver/v4"
@@ -86,7 +87,7 @@ func (h *archiveHandler) HandleFile(ctx logContext.Context, input fileReader) ch
 	}()
 
 	start := time.Now()
-	err := h.openArchive(ctx, 0, input, dataOrErrChan)
+	err := h.openArchive(ctx, []string{}, input, dataOrErrChan)
 	if err == nil {
 		h.metrics.incFilesProcessed()
 	}
@@ -101,37 +102,37 @@ func (h *archiveHandler) HandleFile(ctx logContext.Context, input fileReader) ch
 var ErrMaxDepthReached = errors.New("max archive depth reached")
 
 // openArchive recursively extracts content from an archive up to a maximum depth, handling nested archives if necessary.
-// It takes a reader from which it attempts to identify and process the archive format. Depending on the archive type,
-// it either decompresses or extracts the contents directly, sending data to the provided channel.
+// It takes a string slice representing the path to the archive and a reader from which it attempts to identify and process the archive format.
+// Depending on the archive type, it either decompresses or extracts the contents directly, sending data to the provided channel.
 // Returns an error if the archive cannot be processed due to issues like exceeding maximum depth or unsupported formats.
 func (h *archiveHandler) openArchive(
 	ctx logContext.Context,
-	depth int,
+	archiveEntryPaths []string,
 	reader fileReader,
 	dataOrErrChan chan DataOrErr,
 ) error {
-	ctx.Logger().V(4).Info("Starting archive processing", "depth", depth)
-	defer ctx.Logger().V(4).Info("Finished archive processing", "depth", depth)
+	ctx.Logger().V(4).Info("Starting archive processing", "depth", len(archiveEntryPaths))
+	defer ctx.Logger().V(4).Info("Finished archive processing", "depth", len(archiveEntryPaths))
 
 	if common.IsDone(ctx) {
 		return ctx.Err()
 	}
 
-	if depth >= maxDepth {
+	if len(archiveEntryPaths) >= maxDepth {
 		h.metrics.incMaxArchiveDepthCount()
 		return ErrMaxDepthReached
 	}
 
 	if reader.format == nil {
-		if depth > 0 {
-			return h.handleNonArchiveContent(ctx, newMimeTypeReaderFromFileReader(reader), dataOrErrChan)
+		if len(archiveEntryPaths) > 0 {
+			return h.handleNonArchiveContent(ctx, filepath.Join(archiveEntryPaths...), newMimeTypeReaderFromFileReader(reader), dataOrErrChan)
 		}
 		return fmt.Errorf("unknown archive format")
 	}
 
 	switch archive := reader.format.(type) {
 	case archiver.Decompressor:
-		// Decompress tha archive and feed the decompressed data back into the archive handler to extract any nested archives.
+		// Decompress the archive and feed the decompressed data back into the archive handler to extract any nested archives.
 		compReader, err := archive.OpenReader(reader)
 		if err != nil {
 			return fmt.Errorf("error opening decompressor with format: %s %w", reader.format.Name(), err)
@@ -152,9 +153,11 @@ func (h *archiveHandler) openArchive(
 		}
 		defer rdr.Close()
 
-		return h.openArchive(ctx, depth+1, rdr, dataOrErrChan)
+		// Note: we're limited in our ability to add file names to the archiveEntryPaths here, as the decompressor doesn't have access to a fileName value.
+		// We append an empty string so we can keep track of the archive depth.
+		return h.openArchive(ctx, append(archiveEntryPaths, ""), rdr, dataOrErrChan)
 	case archiver.Extractor:
-		err := archive.Extract(logContext.WithValue(ctx, depthKey, depth+1), reader, nil, h.extractorHandler(dataOrErrChan))
+		err := archive.Extract(ctx, reader, nil, h.extractorHandler(archiveEntryPaths, dataOrErrChan))
 		if err != nil {
 			return fmt.Errorf("error extracting archive with format: %s: %w", reader.format.Name(), err)
 		}
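The archiveEntryPaths slice now does double duty: its length is the recursion depth (replacing the old depth int and depthKey context plumbing), and its joined elements form the entry's display path. Because filepath.Join drops empty elements, the nameless entries appended for decompressor layers still count toward maxDepth without polluting the joined path. A standalone sketch:

    package main

    import (
        "fmt"
        "path/filepath"
    )

    func main() {
        paths := []string{}
        paths = append(paths, "outer.tar") // extractor layer: entry name available
        paths = append(paths, "")          // decompressor layer: nameless, counts toward depth only
        paths = append(paths, "inner.tar") // nested extractor layer

        fmt.Println(len(paths))              // 3: the value compared against maxDepth
        fmt.Println(filepath.Join(paths...)) // "outer.tar/inner.tar": empty elements are skipped
    }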
@@ -168,7 +171,7 @@
 // It logs the extraction, checks for cancellation, and decides whether to skip the file based on its name or type,
 // particularly for binary files if configured to skip. If the file is not skipped, it recursively calls openArchive
 // to handle nested archives or to continue processing based on the file's content and depth in the archive structure.
-func (h *archiveHandler) extractorHandler(dataOrErrChan chan DataOrErr) func(context.Context, archiver.File) error {
+func (h *archiveHandler) extractorHandler(archiveEntryPaths []string, dataOrErrChan chan DataOrErr) func(context.Context, archiver.File) error {
 	return func(ctx context.Context, file archiver.File) error {
 		lCtx := logContext.WithValues(
 			logContext.AddLogger(ctx),
@@ -186,11 +189,6 @@ func (h *archiveHandler) extractorHandler(dataOrErrChan chan DataOrErr) func(con
 			return ctx.Err()
 		}
 
-		depth := 0
-		if ctxDepth, ok := ctx.Value(depthKey).(int); ok {
-			depth = ctxDepth
-		}
-
 		fileSize := file.Size()
 		if int(fileSize) > maxSize {
 			lCtx.Logger().V(2).Info("skipping file: size exceeds max allowed", "size", fileSize, "limit", maxSize)
@@ -206,7 +204,7 @@
 		f, err := file.Open()
 		if err != nil {
-			return fmt.Errorf("error opening file %s: %w", file.Name(), err)
+			return fmt.Errorf("error opening file %s: %w", file.NameInArchive, err)
 		}
 		defer f.Close()
 
@@ -232,7 +230,7 @@
 				lCtx.Logger().V(5).Info("empty reader, skipping file")
 				return nil
 			}
-			return fmt.Errorf("error creating reader for file %s: %w", file.Name(), err)
+			return fmt.Errorf("error creating reader for file %s: %w", file.NameInArchive, err)
 		}
 		defer rdr.Close()
 
@@ -240,6 +238,6 @@
 		h.metrics.observeFileSize(fileSize)
 		lCtx.Logger().V(4).Info("Processed file successfully", "filename", file.Name(), "size", file.Size())
 
-		return h.openArchive(lCtx, depth, rdr, dataOrErrChan)
+		return h.openArchive(lCtx, append(archiveEntryPaths, file.NameInArchive), rdr, dataOrErrChan)
 	}
 }
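For context on the file.Name() to file.NameInArchive switch above: archiver.File embeds fs.FileInfo, so Name() returns only the base name, while the NameInArchive field carries the entry's full path inside the archive. A runnable sketch, assuming the archiver/v4 API this code already vendors, using an in-memory tar:

    package main

    import (
        "archive/tar"
        "bytes"
        "context"
        "fmt"
        "log"

        "github.com/mholt/archiver/v4"
    )

    func main() {
        // Build a tiny in-memory tar with one nested entry.
        var buf bytes.Buffer
        tw := tar.NewWriter(&buf)
        data := []byte("hello")
        if err := tw.WriteHeader(&tar.Header{
            Name: "dir/creds.txt", Mode: 0o600, Size: int64(len(data)), Typeflag: tar.TypeReg,
        }); err != nil {
            log.Fatal(err)
        }
        if _, err := tw.Write(data); err != nil {
            log.Fatal(err)
        }
        tw.Close()

        err := archiver.Tar{}.Extract(context.Background(), &buf, nil,
            func(ctx context.Context, f archiver.File) error {
                // base="creds.txt" (fs.FileInfo), full="dir/creds.txt" (NameInArchive)
                fmt.Printf("base=%q full=%q\n", f.Name(), f.NameInArchive)
                return nil
            })
        if err != nil {
            log.Fatal(err)
        }
    }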
diff --git a/pkg/handlers/archive_test.go b/pkg/handlers/archive_test.go
index 3463b6220c90..df6e16b120e2 100644
--- a/pkg/handlers/archive_test.go
+++ b/pkg/handlers/archive_test.go
@@ -19,60 +19,71 @@ func TestArchiveHandler(t *testing.T) {
 		expectedChunks int
 		matchString    string
 		expectErr      bool
+		matchFileName  string
 	}{
 		"gzip-single": {
 			"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/one-zip.gz",
 			1,
 			"AKIAYVP4CIPPH5TNP3SW",
 			false,
+			"",
 		},
 		"gzip-nested": {
 			"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/double-zip.gz",
 			1,
 			"AKIAYVP4CIPPH5TNP3SW",
 			false,
+			// Empty because we can't get a file path from the nested archiver.OpenReader().
+			"",
 		},
 		"gzip-too-deep": {
 			"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/six-zip.gz",
 			0,
 			"",
 			true,
+			"",
 		},
 		"tar-single": {
 			"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/one.tar",
 			1,
 			"AKIAYVP4CIPPH5TNP3SW",
 			false,
+			"aws-canary-creds",
 		},
 		"tar-nested": {
 			"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/two.tar",
 			1,
 			"AKIAYVP4CIPPH5TNP3SW",
 			false,
+			"one.tar/aws-canary-creds",
 		},
 		"tar-too-deep": {
 			"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/six.tar",
 			0,
 			"",
 			true,
+			"",
 		},
 		"targz-single": {
 			"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/tar-archive.tar.gz",
 			1,
 			"AKIAYVP4CIPPH5TNP3SW",
 			false,
+			"aws-canary-creds",
 		},
 		"gzip-large": {
 			"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/FifteenMB.gz",
 			1543,
 			"AKIAYVP4CIPPH5TNP3SW",
 			false,
+			"",
 		},
 		"zip-single": {
 			"https://raw.githubusercontent.com/bill-rich/bad-secrets/master/aws-canary-creds.zip",
 			1,
 			"AKIAYVP4CIPPH5TNP3SW",
 			false,
+			"aws-canary-creds",
 		},
 	}
 
@@ -104,6 +115,7 @@ func TestArchiveHandler(t *testing.T) {
 			count++
 			if re.Match(chunk.Data) {
 				matched = true
+				assert.Equal(t, testCase.matchFileName, chunk.ArchiveEntryPath)
 			}
 		}
 
@@ -125,6 +137,6 @@ func TestOpenInvalidArchive(t *testing.T) {
 
 	dataOrErrChan := make(chan DataOrErr)
 
-	err = handler.openArchive(ctx, 0, rdr, dataOrErrChan)
+	err = handler.openArchive(ctx, []string{}, rdr, dataOrErrChan)
 	assert.Error(t, err)
 }
diff --git a/pkg/handlers/default.go b/pkg/handlers/default.go
index c65ac4bfe880..1cf2b82ecc51 100644
--- a/pkg/handlers/default.go
+++ b/pkg/handlers/default.go
@@ -44,7 +44,7 @@ func (h *defaultHandler) HandleFile(ctx logContext.Context, input fileReader) ch
 	defer close(dataOrErrChan)
 
 	start := time.Now()
-	err := h.handleNonArchiveContent(ctx, newMimeTypeReaderFromFileReader(input), dataOrErrChan)
+	err := h.handleNonArchiveContent(ctx, emptyFilePath, newMimeTypeReaderFromFileReader(input), dataOrErrChan)
 	if err == nil {
 		h.metrics.incFilesProcessed()
 	}
@@ -93,6 +93,7 @@ func (h *defaultHandler) measureLatencyAndHandleErrors(
 // file content, regardless of being an archive or not, is handled appropriately.
 func (h *defaultHandler) handleNonArchiveContent(
 	ctx logContext.Context,
+	archiveEntryPath string,
 	reader mimeTypeReader,
 	dataOrErrChan chan DataOrErr,
 ) error {
@@ -109,6 +110,9 @@ func (h *defaultHandler) handleNonArchiveContent(
 	chunkReader := sources.NewChunkReader()
 	for data := range chunkReader(ctx, reader) {
 		dataOrErr := DataOrErr{}
+		if archiveEntryPath != "" {
+			dataOrErr.ArchiveEntryPath = archiveEntryPath
+		}
 		if err := data.Error(); err != nil {
 			h.metrics.incErrors()
 			dataOrErr.Err = fmt.Errorf("%w: error reading chunk: %v", ErrProcessingWarning, err)
diff --git a/pkg/handlers/handlers.go b/pkg/handlers/handlers.go
index df9505679827..a5ea8ef07235 100644
--- a/pkg/handlers/handlers.go
+++ b/pkg/handlers/handlers.go
@@ -52,6 +52,9 @@ var (
 	// ErrProcessingWarning indicates a recoverable error that can be logged,
 	// allowing processing to continue.
 	ErrProcessingWarning = errors.New("error processing file")
+
+	// emptyFilePath is used to represent an empty file path.
+	emptyFilePath = ""
 )
 
 type readerConfig struct{ fileExtension string }
@@ -183,8 +186,9 @@ func newFileReader(r io.Reader, options ...readerOption) (fReader fileReader, er
 // efficient streaming of file contents while also providing a way to propagate errors
 // that may occur during the file handling process.
 type DataOrErr struct {
-	Data []byte
-	Err  error
+	Data             []byte
+	Err              error
+	ArchiveEntryPath string // optional; only set for archived files
 }
 
 // FileHandler represents a handler for files.
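Tracing the new field end to end with pared-down stand-ins for the real types (illustrative only): a handler stamps ArchiveEntryPath on each DataOrErr, and handleChunksWithError (next hunk) translates it into the chunk's HandleMetadata map.

    package main

    import "fmt"

    // Pared-down stand-ins for the real DataOrErr and Chunk types.
    type dataOrErr struct {
        data             []byte
        archiveEntryPath string
    }

    type chunk struct {
        data           []byte
        handleMetadata map[string]string
    }

    func main() {
        d := dataOrErr{data: []byte("AKIA..."), archiveEntryPath: "one.tar/aws-canary-creds"}

        c := chunk{data: d.data}
        if d.archiveEntryPath != "" {
            c.handleMetadata = map[string]string{"Archive Entry Path": d.archiveEntryPath}
        }
        fmt.Println(c.handleMetadata["Archive Entry Path"]) // one.tar/aws-canary-creds
    }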
@@ -402,6 +406,7 @@ func HandleFile(
 // - If it contains an error, the function handles it based on severity:
 //   - Fatal errors (context cancellation, deadline exceeded, ErrProcessingFatal) cause immediate termination
 //   - Non-fatal errors (ErrProcessingWarning and others) are logged and processing continues
+//
 // The function also listens for context cancellation to gracefully terminate processing if the context is done.
 // It returns nil upon successful processing of all data, or the first encountered fatal error.
 func handleChunksWithError(
@@ -428,6 +433,9 @@ func handleChunksWithError(
 		if len(dataOrErr.Data) > 0 {
 			chunk := *chunkSkel
 			chunk.Data = dataOrErr.Data
+			if dataOrErr.ArchiveEntryPath != "" {
+				chunk.HandleMetadata = map[string]string{"Archive Entry Path": dataOrErr.ArchiveEntryPath}
+			}
 			if err := reporter.ChunkOk(ctx, chunk); err != nil {
 				return fmt.Errorf("error reporting chunk: %w", err)
 			}
diff --git a/pkg/handlers/rpm.go b/pkg/handlers/rpm.go
index 3450b8749f98..c2750e8ee0b0 100644
--- a/pkg/handlers/rpm.go
+++ b/pkg/handlers/rpm.go
@@ -115,7 +115,9 @@ func (h *rpmHandler) processRPMFiles(
 		return fmt.Errorf("error creating mime-type reader: %w", err)
 	}
 
-	if err := h.handleNonArchiveContent(fileCtx, rdr, dataOrErrChan); err != nil {
+	// TODO: Update processRPMFiles to accommodate nested archives. Once completed,
+	// adjust the emptyFilePath value to reflect the actual file path.
+	if err := h.handleNonArchiveContent(fileCtx, emptyFilePath, rdr, dataOrErrChan); err != nil {
 		dataOrErrChan <- DataOrErr{
 			Err: fmt.Errorf("%w: error processing RPM archive: %v", ErrProcessingWarning, err),
 		}
diff --git a/pkg/sources/sources.go b/pkg/sources/sources.go
index b1190209cf7d..874ce8b6cc23 100644
--- a/pkg/sources/sources.go
+++ b/pkg/sources/sources.go
@@ -40,6 +40,8 @@ type Chunk struct {
 	SourceMetadata *source_metadatapb.MetaData
 	// SourceType is the type of Source that produced the chunk.
 	SourceType sourcespb.SourceType
+	// HandleMetadata holds the metadata from a handler if one was used.
+	HandleMetadata map[string]string
 	// Verify specifies whether any secrets in the Chunk should be verified.
 	Verify bool
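At the source level, a chunk built from an archived file now carries the handler metadata shown below; the engine.go hunk at the top then merges this map into the finding's ExtraData. A sketch that assumes this branch's sources.Chunk with the new field:

    package main

    import (
        "fmt"

        "github.com/trufflesecurity/trufflehog/v3/pkg/sources"
    )

    func main() {
        // HandleMetadata only exists on this branch; existing ExtraData keys
        // take precedence over it when the engine builds the final result.
        chunk := sources.Chunk{
            Data:           []byte("AKIAYVP4CIPPH5TNP3SW"),
            HandleMetadata: map[string]string{"Archive Entry Path": "one.tar/aws-canary-creds"},
        }
        fmt.Println(chunk.HandleMetadata["Archive Entry Path"])
    }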