@@ -2,6 +2,7 @@ package engine
2
2
3
3
import (
4
4
"bytes"
5
+ "fmt"
5
6
"reflect"
6
7
"runtime"
7
8
"strings"
@@ -233,6 +234,22 @@ func (e *Engine) BytesScanned() uint64 {
233
234
return e .bytesScanned
234
235
}
235
236
237
+ func (e * Engine ) dedupeAndSend (chunkResults []detectors.ResultWithMetadata ) {
238
+ dedupeMap := make (map [string ]struct {})
239
+ for _ , result := range chunkResults {
240
+ // dedupe by comparing the detector type, raw result, and source metadata
241
+ // NOTE: in order for the PLAIN decoder to maintain precedence, make sure UTF8 is the first decoder in the
242
+ // default decoders list
243
+ key := fmt .Sprintf ("%s%s%s%+v" , result .DetectorType .String (), result .Raw , result .RawV2 , result .SourceMetadata )
244
+ if _ , ok := dedupeMap [key ]; ok {
245
+ continue
246
+ }
247
+ dedupeMap [key ] = struct {}{}
248
+ e .results <- result
249
+ }
250
+
251
+ }
252
+
236
253
func (e * Engine ) DetectorAvgTime () map [string ][]time.Duration {
237
254
logger := context .Background ().Logger ()
238
255
avgTime := map [string ][]time.Duration {}
@@ -257,6 +274,7 @@ func (e *Engine) DetectorAvgTime() map[string][]time.Duration {
257
274
func (e * Engine ) detectorWorker (ctx context.Context ) {
258
275
for originalChunk := range e .chunks {
259
276
for chunk := range sources .Chunker (originalChunk ) {
277
+ var chunkResults []detectors.ResultWithMetadata
260
278
matchedKeywords := make (map [string ]struct {})
261
279
atomic .AddUint64 (& e .bytesScanned , uint64 (len (chunk .Data )))
262
280
for _ , decoder := range e .decoders {
@@ -273,21 +291,12 @@ func (e *Engine) detectorWorker(ctx context.Context) {
273
291
decoderType = detectorspb .DecoderType_UNKNOWN
274
292
}
275
293
276
- original := chunk .Data
277
294
decoded := decoder .FromChunk (chunk )
278
295
279
296
if decoded == nil {
280
297
continue
281
298
}
282
299
283
- if decoded == nil ||
284
- // check if the decoded data is similar "enough" to the original data. If it is, then we can skip scanning the decoded data as
285
- // it's likely already picked up by the PLAIN decoder. See related issue: https://github.com/trufflesecurity/trufflehog/issues/1450
286
- (decoded != nil &&
287
- decoderType == detectorspb .DecoderType_BASE64 && common .BytesEqual (original , decoded .Data , 40 )) {
288
- continue
289
- }
290
-
291
300
// build a map of all keywords that were matched in the chunk
292
301
for _ , m := range e .prefilter .FindAll (string (decoded .Data )) {
293
302
matchedKeywords [strings .ToLower (string (decoded .Data [m .Start ():m .End ()]))] = struct {}{}
@@ -343,7 +352,7 @@ func (e *Engine) detectorWorker(ctx context.Context) {
343
352
continue
344
353
}
345
354
result .DecoderType = decoderType
346
- e . results <- detectors .CopyMetadata (resultChunk , result )
355
+ chunkResults = append ( chunkResults , detectors .CopyMetadata (resultChunk , result ) )
347
356
348
357
}
349
358
if len (results ) > 0 {
@@ -363,6 +372,7 @@ func (e *Engine) detectorWorker(ctx context.Context) {
363
372
}
364
373
}
365
374
}
375
+ e .dedupeAndSend (chunkResults )
366
376
}
367
377
atomic .AddUint64 (& e .chunksScanned , 1 )
368
378
}
0 commit comments