Skip to content

Commit b48ac24

Browse files
authored
Dedupe results (#1479)
* init 4 dedupin * use raw rather than rawv2 * rm comment * comments * nits * clean up and use rawv2 too * add decoder order test
1 parent a123d5c commit b48ac24

File tree

3 files changed

+31
-10
lines changed

3 files changed

+31
-10
lines changed

pkg/decoders/decoders.go

+1
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66

77
func DefaultDecoders() []Decoder {
88
return []Decoder{
9+
// UTF8 must be first for duplicate detection
910
&UTF8{},
1011
&Base64{},
1112
&UTF16{},

pkg/engine/engine.go

+20-10
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package engine
22

33
import (
44
"bytes"
5+
"fmt"
56
"reflect"
67
"runtime"
78
"strings"
@@ -233,6 +234,22 @@ func (e *Engine) BytesScanned() uint64 {
233234
return e.bytesScanned
234235
}
235236

237+
func (e *Engine) dedupeAndSend(chunkResults []detectors.ResultWithMetadata) {
238+
dedupeMap := make(map[string]struct{})
239+
for _, result := range chunkResults {
240+
// dedupe by comparing the detector type, raw result, and source metadata
241+
// NOTE: in order for the PLAIN decoder to maintain precedence, make sure UTF8 is the first decoder in the
242+
// default decoders list
243+
key := fmt.Sprintf("%s%s%s%+v", result.DetectorType.String(), result.Raw, result.RawV2, result.SourceMetadata)
244+
if _, ok := dedupeMap[key]; ok {
245+
continue
246+
}
247+
dedupeMap[key] = struct{}{}
248+
e.results <- result
249+
}
250+
251+
}
252+
236253
func (e *Engine) DetectorAvgTime() map[string][]time.Duration {
237254
logger := context.Background().Logger()
238255
avgTime := map[string][]time.Duration{}
@@ -257,6 +274,7 @@ func (e *Engine) DetectorAvgTime() map[string][]time.Duration {
257274
func (e *Engine) detectorWorker(ctx context.Context) {
258275
for originalChunk := range e.chunks {
259276
for chunk := range sources.Chunker(originalChunk) {
277+
var chunkResults []detectors.ResultWithMetadata
260278
matchedKeywords := make(map[string]struct{})
261279
atomic.AddUint64(&e.bytesScanned, uint64(len(chunk.Data)))
262280
for _, decoder := range e.decoders {
@@ -273,21 +291,12 @@ func (e *Engine) detectorWorker(ctx context.Context) {
273291
decoderType = detectorspb.DecoderType_UNKNOWN
274292
}
275293

276-
original := chunk.Data
277294
decoded := decoder.FromChunk(chunk)
278295

279296
if decoded == nil {
280297
continue
281298
}
282299

283-
if decoded == nil ||
284-
// check if the decoded data is similar "enough" to the original data. If it is, then we can skip scanning the decoded data as
285-
// it's likely already picked up by the PLAIN decoder. See related issue: https://github.com/trufflesecurity/trufflehog/issues/1450
286-
(decoded != nil &&
287-
decoderType == detectorspb.DecoderType_BASE64 && common.BytesEqual(original, decoded.Data, 40)) {
288-
continue
289-
}
290-
291300
// build a map of all keywords that were matched in the chunk
292301
for _, m := range e.prefilter.FindAll(string(decoded.Data)) {
293302
matchedKeywords[strings.ToLower(string(decoded.Data[m.Start():m.End()]))] = struct{}{}
@@ -343,7 +352,7 @@ func (e *Engine) detectorWorker(ctx context.Context) {
343352
continue
344353
}
345354
result.DecoderType = decoderType
346-
e.results <- detectors.CopyMetadata(resultChunk, result)
355+
chunkResults = append(chunkResults, detectors.CopyMetadata(resultChunk, result))
347356

348357
}
349358
if len(results) > 0 {
@@ -363,6 +372,7 @@ func (e *Engine) detectorWorker(ctx context.Context) {
363372
}
364373
}
365374
}
375+
e.dedupeAndSend(chunkResults)
366376
}
367377
atomic.AddUint64(&e.chunksScanned, 1)
368378
}

pkg/engine/engine_test.go

+10
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package engine
33
import (
44
"testing"
55

6+
"github.com/trufflesecurity/trufflehog/v3/pkg/decoders"
67
"github.com/trufflesecurity/trufflehog/v3/pkg/detectors"
78
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
89
)
@@ -62,3 +63,12 @@ func TestFragmentLineOffset(t *testing.T) {
6263
})
6364
}
6465
}
66+
67+
// Test to make sure that DefaultDecoders always returns the UTF8 decoder first.
68+
// Technically a decoder test but we want this to run and fail in CI
69+
func TestDefaultDecoders(t *testing.T) {
70+
ds := decoders.DefaultDecoders()
71+
if _, ok := ds[0].(*decoders.UTF8); !ok {
72+
t.Errorf("DefaultDecoders() = %v, expected UTF8 decoder to be first", ds)
73+
}
74+
}

0 commit comments

Comments
 (0)