Skip to content

Commit 766d8b7

Browse files
committed
feat(decoders): HTML entities
1 parent 21fbe08 commit 766d8b7

15 files changed

+1407
-1063
lines changed

hack/snifftest/main.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ func main() {
122122
for chunk := range chunksChan {
123123
for name, scanner := range selectedScanners {
124124
for _, dec := range allDecoders {
125-
decoded := dec.FromChunk(&sources.Chunk{Data: chunk.Data})
125+
decoded := dec.FromChunk(ctx, &sources.Chunk{Data: chunk.Data})
126126
if decoded != nil {
127127
foundKeyword := false
128128
for _, kw := range scanner.Keywords() {

pkg/decoders/base64.go

+2-1
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"encoding/base64"
66
"unicode"
77

8+
"github.com/trufflesecurity/trufflehog/v3/pkg/context"
89
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
910
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
1011
)
@@ -31,7 +32,7 @@ func (d *Base64) Type() detectorspb.DecoderType {
3132
return detectorspb.DecoderType_BASE64
3233
}
3334

34-
func (d *Base64) FromChunk(chunk *sources.Chunk) *DecodableChunk {
35+
func (d *Base64) FromChunk(_ context.Context, chunk *sources.Chunk) *DecodableChunk {
3536
decodableChunk := &DecodableChunk{Chunk: chunk, DecoderType: d.Type()}
3637
encodedSubstrings := getSubstringsOfCharacterSet(chunk.Data, 20, b64CharsetMapping, b64EndChars)
3738
decodedSubstrings := make(map[string][]byte)

pkg/decoders/base64_test.go

+5-4
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55

66
"github.com/kylelemons/godebug/pretty"
77

8+
"github.com/trufflesecurity/trufflehog/v3/pkg/context"
89
"github.com/trufflesecurity/trufflehog/v3/pkg/detectors"
910
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
1011
)
@@ -134,7 +135,7 @@ func TestBase64_FromChunk(t *testing.T) {
134135
for _, tt := range tests {
135136
t.Run(tt.name, func(t *testing.T) {
136137
d := &Base64{}
137-
got := d.FromChunk(tt.chunk)
138+
got := d.FromChunk(context.Background(), tt.chunk)
138139
if tt.want != nil {
139140
if got == nil {
140141
t.Fatal("got nil, did not want nil")
@@ -156,7 +157,7 @@ func BenchmarkFromChunkSmall(b *testing.B) {
156157
data := detectors.MustGetBenchmarkData()["small"]
157158

158159
for n := 0; n < b.N; n++ {
159-
d.FromChunk(&sources.Chunk{Data: data})
160+
d.FromChunk(context.Background(), &sources.Chunk{Data: data})
160161
}
161162
}
162163

@@ -165,7 +166,7 @@ func BenchmarkFromChunkMedium(b *testing.B) {
165166
data := detectors.MustGetBenchmarkData()["medium"]
166167

167168
for n := 0; n < b.N; n++ {
168-
d.FromChunk(&sources.Chunk{Data: data})
169+
d.FromChunk(context.Background(), &sources.Chunk{Data: data})
169170
}
170171
}
171172

@@ -174,6 +175,6 @@ func BenchmarkFromChunkLarge(b *testing.B) {
174175
data := detectors.MustGetBenchmarkData()["big"]
175176

176177
for n := 0; n < b.N; n++ {
177-
d.FromChunk(&sources.Chunk{Data: data})
178+
d.FromChunk(context.Background(), &sources.Chunk{Data: data})
178179
}
179180
}

pkg/decoders/decoders.go

+5-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package decoders
22

33
import (
4+
"github.com/trufflesecurity/trufflehog/v3/pkg/context"
45
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
56
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
67
)
@@ -12,6 +13,7 @@ func DefaultDecoders() []Decoder {
1213
&Base64{},
1314
&UTF16{},
1415
&EscapedUnicode{},
16+
&HtmlEntity{},
1517
}
1618
}
1719

@@ -23,21 +25,22 @@ type DecodableChunk struct {
2325
}
2426

2527
type Decoder interface {
26-
FromChunk(chunk *sources.Chunk) *DecodableChunk
28+
FromChunk(ctx context.Context, chunk *sources.Chunk) *DecodableChunk
2729
Type() detectorspb.DecoderType
2830
}
2931

3032
// Fuzz is an entrypoint for go-fuzz, which is an AFL-style fuzzing tool.
3133
// This one attempts to uncover any panics during decoding.
3234
func Fuzz(data []byte) int {
3335
decoded := false
36+
ctx := context.Background()
3437
for i, decoder := range DefaultDecoders() {
3538
// Skip the first decoder (plain), because it will always decode and give
3639
// priority to the input (return 1).
3740
if i == 0 {
3841
continue
3942
}
40-
chunk := decoder.FromChunk(&sources.Chunk{Data: data})
43+
chunk := decoder.FromChunk(ctx, &sources.Chunk{Data: data})
4144
if chunk != nil {
4245
decoded = true
4346
}

pkg/decoders/escaped_unicode.go

+3-2
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ import (
66
"strconv"
77
"unicode/utf8"
88

9+
"github.com/trufflesecurity/trufflehog/v3/pkg/context"
910
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
1011
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
1112
)
@@ -18,7 +19,7 @@ var _ Decoder = (*EscapedUnicode)(nil)
1819
// https://dencode.com/en/string/unicode-escape
1920
var (
2021
// Standard Unicode notation.
21-
//https://unicode.org/standard/principles.html
22+
// https://unicode.org/standard/principles.html
2223
codePointPat = regexp.MustCompile(`\bU\+([a-fA-F0-9]{4}).?`)
2324

2425
// Common escape sequence used in programming languages.
@@ -29,7 +30,7 @@ func (d *EscapedUnicode) Type() detectorspb.DecoderType {
2930
return detectorspb.DecoderType_ESCAPED_UNICODE
3031
}
3132

32-
func (d *EscapedUnicode) FromChunk(chunk *sources.Chunk) *DecodableChunk {
33+
func (d *EscapedUnicode) FromChunk(_ context.Context, chunk *sources.Chunk) *DecodableChunk {
3334
if chunk == nil || len(chunk.Data) == 0 {
3435
return nil
3536
}

pkg/decoders/escaped_unicode_test.go

+2-1
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55

66
"github.com/kylelemons/godebug/pretty"
77

8+
"github.com/trufflesecurity/trufflehog/v3/pkg/context"
89
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
910
)
1011

@@ -68,7 +69,7 @@ func TestUnicodeEscape_FromChunk(t *testing.T) {
6869
for _, tt := range tests {
6970
t.Run(tt.name, func(t *testing.T) {
7071
d := &EscapedUnicode{}
71-
got := d.FromChunk(tt.chunk)
72+
got := d.FromChunk(context.Background(), tt.chunk)
7273
if tt.want != nil {
7374
if got == nil {
7475
t.Fatal("got nil, did not want nil")

pkg/decoders/html_entity.go

+219
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,219 @@
1+
package decoders
2+
3+
import (
4+
"bytes"
5+
"errors"
6+
"regexp"
7+
"strconv"
8+
"strings"
9+
"sync"
10+
11+
ahocorasick "github.com/BobuSumisu/aho-corasick"
12+
"github.com/go-logr/logr"
13+
"golang.org/x/exp/maps"
14+
15+
"github.com/trufflesecurity/trufflehog/v3/pkg/context"
16+
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
17+
"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
18+
)
19+
20+
// HtmlEntity decodes characters that are encoded as decimal, hexadecimal, or named entities.
21+
// https://www.ee.ucl.ac.uk/~mflanaga/java/HTMLandASCIItableC1.html
22+
type HtmlEntity struct{}
23+
24+
var (
25+
_ Decoder = (*HtmlEntity)(nil)
26+
27+
once sync.Once
28+
htmlTrie *ahocorasick.Trie
29+
)
30+
31+
func init() {
32+
// Use Aho-Corasick to pre-filter potential matches.
33+
once.Do(func() {
34+
keywords := map[string]struct{}{
35+
`&#`: {}, // decimal
36+
`&#x`: {}, // hex
37+
}
38+
for entity := range namedEntityMap {
39+
keywords[strings.ToLower(entity)] = struct{}{}
40+
}
41+
htmlTrie = ahocorasick.NewTrieBuilder().AddStrings(maps.Keys(keywords)).Build()
42+
})
43+
}
44+
45+
// Type returns detectorspb.DecoderType_HTML, the decoder type that FromChunk
// stamps on every DecodableChunk it produces.
func (d *HtmlEntity) Type() detectorspb.DecoderType {
46+
return detectorspb.DecoderType_HTML
47+
}
48+
49+
func (d *HtmlEntity) FromChunk(ctx context.Context, chunk *sources.Chunk) *DecodableChunk {
50+
if chunk == nil || len(chunk.Data) == 0 {
51+
return nil
52+
} else if m := htmlTrie.MatchFirst(chunk.Data); m == nil {
53+
return nil
54+
}
55+
56+
var (
57+
logger = ctx.Logger().WithName("decoders.html")
58+
// Necessary to avoid data races.
59+
chunkData = bytes.Clone(chunk.Data)
60+
matched = false
61+
)
62+
if namedEntityPat.Match(chunkData) {
63+
matched = true
64+
chunkData = decodeNamedEntities(logger, chunkData)
65+
}
66+
if decimalEntityPat.Match(chunkData) {
67+
matched = true
68+
chunkData = decodeHtmlDecimal(logger, chunkData)
69+
}
70+
if hexEntityPat.Match(chunkData) {
71+
matched = true
72+
chunkData = decodeHtmlHex(logger, chunkData)
73+
}
74+
75+
if matched {
76+
return &DecodableChunk{
77+
DecoderType: d.Type(),
78+
Chunk: &sources.Chunk{
79+
Data: chunkData,
80+
SourceName: chunk.SourceName,
81+
SourceID: chunk.SourceID,
82+
JobID: chunk.JobID,
83+
SecretID: chunk.SecretID,
84+
SourceMetadata: chunk.SourceMetadata,
85+
SourceType: chunk.SourceType,
86+
Verify: chunk.Verify,
87+
},
88+
}
89+
} else {
90+
return nil
91+
}
92+
}
93+
94+
// `A` = `&#65;`
95+
var decimalEntityPat = regexp.MustCompile(`&#(\d{1,3});`)
96+
97+
func decodeHtmlDecimal(logger logr.Logger, input []byte) []byte {
98+
decoded := make([]byte, 0, len(input))
99+
lastIndex := 0
100+
101+
for _, match := range decimalEntityPat.FindAllSubmatchIndex(input, -1) {
102+
startIndex := match[0]
103+
endIndex := match[1]
104+
decStartIndex := match[2]
105+
decEndIndex := match[3]
106+
107+
// Copy the part of the input until the start of the entity
108+
decoded = append(decoded, input[lastIndex:startIndex]...)
109+
110+
num, err := strconv.Atoi(string(input[decStartIndex:decEndIndex]))
111+
if err != nil {
112+
continue
113+
}
114+
115+
// Append the decoded byte
116+
if num < 0 || num > 255 {
117+
logger.Error(errors.New("invalid decimal byte"), "Unable to decode HTML entity", "match", input[decStartIndex:decEndIndex], "byte", num)
118+
continue
119+
}
120+
decoded = append(decoded, byte(num))
121+
lastIndex = endIndex
122+
}
123+
124+
// Append the remaining part of the input
125+
decoded = append(decoded, input[lastIndex:]...)
126+
127+
return decoded
128+
}
129+
130+
// `A` = `&#x41;`
131+
var hexEntityPat = regexp.MustCompile(`(?i)&#x([a-f0-9]{1,2});`)
132+
133+
func decodeHtmlHex(logger logr.Logger, input []byte) []byte {
134+
decoded := make([]byte, 0, len(input))
135+
lastIndex := 0
136+
137+
for _, match := range hexEntityPat.FindAllSubmatchIndex(input, -1) {
138+
startIndex := match[0]
139+
endIndex := match[1]
140+
hexStartIndex := match[2]
141+
hexEndIndex := match[3]
142+
143+
// Copy the part of the input until the start of the entity
144+
decoded = append(decoded, input[lastIndex:startIndex]...)
145+
146+
// Parse the hexadecimal value to an integer
147+
num, err := strconv.ParseInt(string(input[hexStartIndex:hexEndIndex]), 16, 32)
148+
if err != nil {
149+
continue
150+
}
151+
152+
// Append the decoded byte
153+
if num < 0 || num > 255 {
154+
logger.Error(errors.New("invalid hex byte"), "Unable to decode HTML entity", "match", input[hexStartIndex:hexEndIndex], "byte", num)
155+
continue
156+
}
157+
decoded = append(decoded, byte(num))
158+
159+
lastIndex = endIndex
160+
}
161+
162+
// Append the remaining part of the input
163+
decoded = append(decoded, input[lastIndex:]...)
164+
165+
return decoded
166+
}
167+
168+
var (
169+
// https://www.compart.com/en/unicode/html
170+
namedEntityMap = map[string][]byte{
171+
"&tab;": []byte(" "),
172+
"&newline;": []byte("\n"),
173+
"&excl;": []byte("!"),
174+
"&quot;": []byte(`"`),
175+
"&num;": []byte("#"),
176+
"&dollar;": []byte("$"),
177+
"&percnt;": []byte("%"),
178+
"&amp;": []byte("&"),
179+
"&apos;": []byte("'"),
180+
"&lpar;": []byte("("),
181+
"&rpar;": []byte(")"),
182+
"&ast;": []byte("*"),
183+
"&plus;": []byte("+"),
184+
"&comma;": []byte(","),
185+
"&period;": []byte("."),
186+
"&sol;": []byte("/"),
187+
"&colon;": []byte(":"),
188+
"&semi;": []byte(";"),
189+
"&lt;": []byte("<"),
190+
"&equals;": []byte("="),
191+
"&gt;": []byte(">"),
192+
"&quest;": []byte("?"),
193+
"&commat;": []byte("@"),
194+
"&lsqb;": []byte("["),
195+
"&bsol;": []byte("\\"),
196+
"&rsqb;": []byte("]"),
197+
"&hat;": []byte("^"),
198+
"&underbar;": []byte("_"),
199+
"&diacriticalgrave;": []byte("`"),
200+
"&lcub;": []byte("{"),
201+
"&verticalline;": []byte("|"),
202+
"&rcub;": []byte("}"),
203+
"&nonbreakingspace;": []byte(" "),
204+
}
205+
namedEntityPat = func() *regexp.Regexp {
206+
return regexp.MustCompile(
207+
"(?i)(" + strings.Join(maps.Keys(namedEntityMap), "|") + ")")
208+
}()
209+
)
210+
211+
func decodeNamedEntities(_ logr.Logger, input []byte) []byte {
212+
return namedEntityPat.ReplaceAllFunc(input, func(match []byte) []byte {
213+
m := strings.ToLower(string(match))
214+
if replacement, ok := namedEntityMap[m]; ok {
215+
return replacement
216+
}
217+
return match
218+
})
219+
}

0 commit comments

Comments
 (0)