Skip to content

Commit 6a4856c

Browse files
authored
feat(scrapingbee): tweak detections (#3820)
1 parent 827a201 commit 6a4856c

File tree

2 files changed

+167
-53
lines changed

2 files changed

+167
-53
lines changed

pkg/detectors/scrapingbee/scrapingbee.go

+56 -32
Original file line number | Diff line number | Diff line change
@@ -2,74 +2,98 @@ package scrapingbee
22

33
import (
44
"context"
5-
regexp "github.com/wasilibs/go-re2"
5+
"fmt"
6+
"io"
67
"net/http"
7-
"strings"
8+
9+
regexp "github.com/wasilibs/go-re2"
810

911
"github.com/trufflesecurity/trufflehog/v3/pkg/common"
1012
"github.com/trufflesecurity/trufflehog/v3/pkg/detectors"
1113
"github.com/trufflesecurity/trufflehog/v3/pkg/pb/detectorspb"
1214
)
1315

14-
type Scanner struct{}
16+
type Scanner struct {
17+
client *http.Client
18+
}
1519

1620
// Ensure the Scanner satisfies the interface at compile time.
1721
var _ detectors.Detector = (*Scanner)(nil)
1822

19-
var (
20-
client = common.SaneHttpClient()
23+
func (s Scanner) Type() detectorspb.DetectorType {
24+
return detectorspb.DetectorType_ScrapingBee
25+
}
2126

22-
// Make sure that your group is surrounded in boundary characters such as below to reduce false positives.
23-
keyPat = regexp.MustCompile(detectors.PrefixRegex([]string{"scrapingbee"}) + `\b([A-Z0-9]{80})\b`)
24-
)
27+
func (s Scanner) Description() string {
28+
return "ScrapingBee is a web scraping service that handles headless browsers and proxies for you. ScrapingBee API keys can be used to access and control web scraping tasks."
29+
}
2530

2631
// Keywords are used for efficiently pre-filtering chunks.
2732
// Use identifiers in the secret preferably, or the provider name.
2833
func (s Scanner) Keywords() []string {
29-
return []string{"scrapingbee"}
34+
return []string{"scrapingbee", "scraping bee", "scraping-bee", "scraping_bee"}
3035
}
3136

37+
var (
38+
keyPat = regexp.MustCompile(detectors.PrefixRegex([]string{"scraping[ _-]?bee"}) + `\b([A-Z0-9]{80})\b`)
39+
)
40+
3241
// FromData will find and optionally verify ScrapingBee secrets in a given set of bytes.
3342
func (s Scanner) FromData(ctx context.Context, verify bool, data []byte) (results []detectors.Result, err error) {
3443
dataStr := string(data)
3544

36-
matches := keyPat.FindAllStringSubmatch(dataStr, -1)
37-
38-
for _, match := range matches {
39-
if len(match) != 2 {
45+
uniqueMatches := make(map[string]struct{})
46+
for _, match := range keyPat.FindAllStringSubmatch(dataStr, -1) {
47+
m := match[1]
48+
if detectors.StringShannonEntropy(m) < 3.5 {
4049
continue
4150
}
42-
resMatch := strings.TrimSpace(match[1])
51+
uniqueMatches[m] = struct{}{}
52+
}
4353

44-
s1 := detectors.Result{
54+
for key := range uniqueMatches {
55+
r := detectors.Result{
4556
DetectorType: detectorspb.DetectorType_ScrapingBee,
46-
Raw: []byte(resMatch),
57+
Raw: []byte(key),
4758
}
4859

4960
if verify {
50-
req, err := http.NewRequestWithContext(ctx, "GET", "https://app.scrapingbee.com/api/v1/?api_key="+resMatch+"&url=https://httpbin.org/anything?json&render_js=false", nil)
51-
if err != nil {
52-
continue
53-
}
54-
res, err := client.Do(req)
55-
if err == nil {
56-
defer res.Body.Close()
57-
if res.StatusCode >= 200 && res.StatusCode < 300 {
58-
s1.Verified = true
59-
}
61+
if s.client == nil {
62+
s.client = common.SaneHttpClient()
6063
}
64+
65+
isVerified, verificationErr := verifyMatch(ctx, s.client, key)
66+
r.Verified = isVerified
67+
r.SetVerificationError(verificationErr, key)
6168
}
6269

63-
results = append(results, s1)
70+
results = append(results, r)
6471
}
6572

6673
return results, nil
6774
}
6875

69-
func (s Scanner) Type() detectorspb.DetectorType {
70-
return detectorspb.DetectorType_ScrapingBee
71-
}
76+
func verifyMatch(ctx context.Context, client *http.Client, key string) (bool, error) {
77+
req, err := http.NewRequestWithContext(ctx, http.MethodGet, "https://app.scrapingbee.com/api/v1/?api_key="+key+"&url=https://httpbin.org/anything?json&render_js=false", nil)
78+
if err != nil {
79+
return false, err
80+
}
7281

73-
func (s Scanner) Description() string {
74-
return "ScrapingBee is a web scraping service that handles headless browsers and proxies for you. ScrapingBee API keys can be used to access and control web scraping tasks."
82+
res, err := client.Do(req)
83+
if err != nil {
84+
return false, err
85+
}
86+
defer func() {
87+
_, _ = io.Copy(io.Discard, res.Body)
88+
_ = res.Body.Close()
89+
}()
90+
91+
switch res.StatusCode {
92+
case http.StatusOK:
93+
return true, nil
94+
case http.StatusUnauthorized:
95+
return false, nil
96+
default:
97+
return false, fmt.Errorf("unexpected status code: %d", res.StatusCode)
98+
}
7599
}

pkg/detectors/scrapingbee/scrapingbee_test.go

+111 -21
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,6 @@ package scrapingbee
22

33
import (
44
"context"
5-
"fmt"
65
"testing"
76

87
"github.com/google/go-cmp/cmp"
@@ -11,42 +10,133 @@ import (
1110
"github.com/trufflesecurity/trufflehog/v3/pkg/engine/ahocorasick"
1211
)
1312

14-
var (
15-
validPattern = "HOLTCTPVL3V35NK795FY6INWQKTTTSGL6W26JKHLC3CGUNJ1DV35SWPUAZ26BUX2EEZZ7T2SJBOL8WZB"
16-
invalidPattern = "HOLT?TPVL3V35NK795FY6INWQKTTTSGL6W26JKHLC3CGUNJ1DV35SWPUAZ26BUX2EEZZ7T2SJBOL8WZB"
17-
keyword = "scrapingbee"
18-
)
19-
2013
func TestScrapingBee_Pattern(t *testing.T) {
21-
d := Scanner{}
22-
ahoCorasickCore := ahocorasick.NewAhoCorasickCore([]detectors.Detector{d})
2314
tests := []struct {
2415
name string
2516
input string
2617
want []string
2718
}{
19+
// True positives
20+
{
21+
name: `valid_query_param`,
22+
input: ` #CHANGE API KEY TO CURRENT API KEY ON SCRAPINGBEE BELOW:
23+
uri = URI("https://app.scrapingbee.com/api/v1/?api_key=VNC7VJ04BQLZWL821KJ4ZLG17ON45K4Y56P59QZMDNZBWRFAS0LIK47I3KFH6AMLUXPHIUIFBDOMIOUE&url=#{url}&stealth_proxy=True&country_code=sg&wait_browser=networkidle2&json_response=True&block_resources=False&block_ads=True&js_scenario=" + CGI.escape(js_scenario))`,
24+
want: []string{`VNC7VJ04BQLZWL821KJ4ZLG17ON45K4Y56P59QZMDNZBWRFAS0LIK47I3KFH6AMLUXPHIUIFBDOMIOUE`},
25+
},
26+
{
27+
name: `valid_function_comment`,
28+
input: `func connectToScrapingBee() {
29+
// API KEY = M977YHXCMPJJ569DSB0B8KSKL9NRU2O2327MIDT55785T8LS9TJGDW4GFMCMOZNRVN3GPSXF0Y6DGC32`,
30+
want: []string{`M977YHXCMPJJ569DSB0B8KSKL9NRU2O2327MIDT55785T8LS9TJGDW4GFMCMOZNRVN3GPSXF0Y6DGC32`},
31+
},
2832
{
29-
name: "valid pattern - with keyword scrapingbee",
30-
input: fmt.Sprintf("%s token = '%s'", keyword, validPattern),
31-
want: []string{validPattern},
33+
name: `valid_csharp`,
34+
input: ` class test{
35+
36+
string BASE_URL = @"https://app.scrapingbee.com/api/v1/";
37+
string API_KEY = "2OZ3HPYEUP9LVCN9TSMBEP5OU0C65AXL7MDO76VPYQNVAJW8NU0QUQQPEV7C51XQDLZUUYKZ5TAW2L85";
38+
39+
public static string Get(string url)`,
40+
want: []string{`2OZ3HPYEUP9LVCN9TSMBEP5OU0C65AXL7MDO76VPYQNVAJW8NU0QUQQPEV7C51XQDLZUUYKZ5TAW2L85`},
41+
},
42+
{
43+
name: `valid_js1`,
44+
input: ` const options = {
45+
uri: "https://app.scrapingbee.com/api/v1?",
46+
api_key: "34TOQQ77QJALLR07ISPYL4B5EYHW3YLU5GM97GQOCA32BVW3S0S6RTVFCZGTHZ1Q5MHH1Z9GZ0B640LI",
47+
};`,
48+
want: []string{`34TOQQ77QJALLR07ISPYL4B5EYHW3YLU5GM97GQOCA32BVW3S0S6RTVFCZGTHZ1Q5MHH1Z9GZ0B640LI`},
3249
},
3350
{
34-
name: "valid pattern - ignore duplicate",
35-
input: fmt.Sprintf("%s token = '%s' | '%s'", keyword, validPattern, validPattern),
36-
want: []string{validPattern},
51+
name: `valid_js2`,
52+
input: ` useEffect(() => {
53+
setLoading(true)
54+
base.get('https://app.scrapingbee.com/api/v1', {
55+
params:{'api_key':'BYZCNNS0SOZCPC4EXD5SXSH0PWAXPWFMZ4SXVEQNEDMKSGBP57K31PJ44V46344XCYN7IARKQWLS0V3X',
56+
'url': 'https://www.flipkart.com/search?q=${searchItem}',
57+
'block_resources': 'false',
58+
}
59+
}).then((response) => {`,
60+
want: []string{`BYZCNNS0SOZCPC4EXD5SXSH0PWAXPWFMZ4SXVEQNEDMKSGBP57K31PJ44V46344XCYN7IARKQWLS0V3X`},
3761
},
3862
{
39-
name: "valid pattern - key out of prefix range",
40-
input: fmt.Sprintf("%s keyword is not close to the real key in the data\n = '%s'", keyword, validPattern),
41-
want: []string{},
63+
name: `valid_js3`,
64+
input: `const scrapingBeeApiKey =
65+
"P5IS953T7OYL5KJG8J3SVPAV5VUJ49L2OXB7HIQDVL8SSG7O9A3J6DQ6CTK65KEAM7L7MQJIEW20ZOCP"; // Replace 'YOUR_SCRAPING_BEE_API_KEY' with your actual API key`,
66+
want: []string{`P5IS953T7OYL5KJG8J3SVPAV5VUJ49L2OXB7HIQDVL8SSG7O9A3J6DQ6CTK65KEAM7L7MQJIEW20ZOCP`},
4267
},
4368
{
44-
name: "invalid pattern",
45-
input: fmt.Sprintf("%s = '%s'", keyword, invalidPattern),
46-
want: []string{},
69+
name: `valid_php`,
70+
input: `// Set base url & API key
71+
$BASE_URL = "https://app.scrapingbee.com/api/v1/?";
72+
$API_KEY = "R4EEK5MWM2GXNK1TZUU9Z0EBA29ZUW7PW12MHI4T1BHSR7GM1G37C5BL2NHLPWC0J6VOQWP5IZJ15QV8";
73+
`,
74+
want: []string{`R4EEK5MWM2GXNK1TZUU9Z0EBA29ZUW7PW12MHI4T1BHSR7GM1G37C5BL2NHLPWC0J6VOQWP5IZJ15QV8`},
75+
},
76+
{
77+
name: `valid_python_sdk`,
78+
input: `client = ScrapingBeeClient(api_key='MZ13G1AVV8C5MEYVOIMIGJEPUH0PBSJPYTCO6IUWRZS3BXNOLA4TUP27ZGQ97LS8NRBCO66WF3ZUKSFX')`,
79+
want: []string{`MZ13G1AVV8C5MEYVOIMIGJEPUH0PBSJPYTCO6IUWRZS3BXNOLA4TUP27ZGQ97LS8NRBCO66WF3ZUKSFX`},
80+
},
81+
{
82+
name: `valid_python_sdk_newline`,
83+
input: `def main():
84+
client = ScrapingBeeClient(
85+
api_key='E1PJA1D78TBTM320Z8O9XS2MTWHTCL1NSJXGFKIZO6TJB4XIM94OSR6KQNU415QB97MYJEP6T3O0IWR3')`,
86+
87+
want: []string{`E1PJA1D78TBTM320Z8O9XS2MTWHTCL1NSJXGFKIZO6TJB4XIM94OSR6KQNU415QB97MYJEP6T3O0IWR3`},
88+
},
89+
{
90+
name: `valid_python_notebook`,
91+
input: ` "source": [
92+
"Every time you call any function there is an HTTPS request to Google's servers. To prevent your servers IP address being locked by Google we should use a service that handles proxy rotation for us. In this case we are using **ScrapingBee API**.\n",
93+
"\n",
94+
"ScrapingBee API key:\n",
95+
"\n",
96+
" QEUXIXLN8OULIISPZ1FXZUCWF7M42ZOUXRV7491R6RYQTFCSV8A4Y1B2YFPCD0HL2X62KPGTHFODSW6G\n",
97+
"\n",
98+
"NOTE: This API key is available till 08 March 2021 and expires after 200 requests \n",
99+
"NOTE: **this Python package still works out of the box**."
100+
]`,
101+
102+
want: []string{`QEUXIXLN8OULIISPZ1FXZUCWF7M42ZOUXRV7491R6RYQTFCSV8A4Y1B2YFPCD0HL2X62KPGTHFODSW6G`},
103+
},
104+
{
105+
name: `valid_python_nonapiurl`,
106+
input: `##########################################################################################################
107+
# We use the best scraper service API, Scraping Bee.
108+
api_key = "CXUWSH6Y2BRB8F07MB7YXWPYWV2TQ4K51G4N6SGEU1YDADAVDW35ZT7WNISZ8YMCQ810OP9KG22ZI2P2"`,
109+
want: []string{`CXUWSH6Y2BRB8F07MB7YXWPYWV2TQ4K51G4N6SGEU1YDADAVDW35ZT7WNISZ8YMCQ810OP9KG22ZI2P2`},
110+
},
111+
{
112+
name: `valid_underscore`,
113+
input: ` gn = GoogleNews()
114+
115+
# it's a fake API key, do not try to use it
116+
gn.top_news(scraping_bee = 'I5SYNPRFZI41WHVQWWUT0GNXFMO104343E7CXFIISR01E2V8ETSMXMJFK1XNKM7FDEEPUPRM0FYAHFF5')`,
117+
118+
want: []string{`I5SYNPRFZI41WHVQWWUT0GNXFMO104343E7CXFIISR01E2V8ETSMXMJFK1XNKM7FDEEPUPRM0FYAHFF5`},
119+
},
120+
// TODO: support this
121+
// {
122+
// name: `valid_js_suffix`,
123+
// input: ` do {
124+
// // const apiKey = 'TQ9CDAZSORUPU1NMZXZEM11VY7K3NC3HJPBNYP2V4CZZXUY9SWEULNDHOZ77XGWO9FA9A12XWFVWUBZJ';
125+
// // const client = new scrapingbee.ScrapingBeeClient(apiKey);
126+
// `,
127+
//
128+
// want: []string{ `TQ9CDAZSORUPU1NMZXZEM11VY7K3NC3HJPBNYP2V4CZZXUY9SWEULNDHOZ77XGWO9FA9A12XWFVWUBZJ`},
129+
// },
130+
131+
// False positives
132+
{
133+
name: `invalid - lowercase`,
134+
input: `const scrapingbeeKey = 'tq9cdazsorupu1nmzxzem11vy7k3nc3hjpbnyp2v4czzxuy9sweulndhoz77xgwo9fa9a12xwfvwubzj'`,
47135
},
48136
}
49137

138+
d := Scanner{}
139+
ahoCorasickCore := ahocorasick.NewAhoCorasickCore([]detectors.Detector{d})
50140
for _, test := range tests {
51141
t.Run(test.name, func(t *testing.T) {
52142
matchedDetectors := ahoCorasickCore.FindDetectorMatches([]byte(test.input))

0 commit comments

Comments
 (0)