@@ -2,7 +2,6 @@ package scrapingbee
2
2
3
3
import (
4
4
"context"
5
- "fmt"
6
5
"testing"
7
6
8
7
"github.com/google/go-cmp/cmp"
@@ -11,42 +10,133 @@ import (
11
10
"github.com/trufflesecurity/trufflehog/v3/pkg/engine/ahocorasick"
12
11
)
13
12
14
- var (
15
- validPattern = "HOLTCTPVL3V35NK795FY6INWQKTTTSGL6W26JKHLC3CGUNJ1DV35SWPUAZ26BUX2EEZZ7T2SJBOL8WZB"
16
- invalidPattern = "HOLT?TPVL3V35NK795FY6INWQKTTTSGL6W26JKHLC3CGUNJ1DV35SWPUAZ26BUX2EEZZ7T2SJBOL8WZB"
17
- keyword = "scrapingbee"
18
- )
19
-
20
13
func TestScrapingBee_Pattern (t * testing.T ) {
21
- d := Scanner {}
22
- ahoCorasickCore := ahocorasick .NewAhoCorasickCore ([]detectors.Detector {d })
23
14
tests := []struct {
24
15
name string
25
16
input string
26
17
want []string
27
18
}{
19
+ // True positives
20
+ {
21
+ name : `valid_query_param` ,
22
+ input : ` #CHANGE API KEY TO CURRENT API KEY ON SCRAPINGBEE BELOW:
23
+ uri = URI("https://app.scrapingbee.com/api/v1/?api_key=VNC7VJ04BQLZWL821KJ4ZLG17ON45K4Y56P59QZMDNZBWRFAS0LIK47I3KFH6AMLUXPHIUIFBDOMIOUE&url=#{url}&stealth_proxy=True&country_code=sg&wait_browser=networkidle2&json_response=True&block_resources=False&block_ads=True&js_scenario=" + CGI.escape(js_scenario))` ,
24
+ want : []string {`VNC7VJ04BQLZWL821KJ4ZLG17ON45K4Y56P59QZMDNZBWRFAS0LIK47I3KFH6AMLUXPHIUIFBDOMIOUE` },
25
+ },
26
+ {
27
+ name : `valid_function_comment` ,
28
+ input : `func connectToScrapingBee() {
29
+ // API KEY = M977YHXCMPJJ569DSB0B8KSKL9NRU2O2327MIDT55785T8LS9TJGDW4GFMCMOZNRVN3GPSXF0Y6DGC32` ,
30
+ want : []string {`M977YHXCMPJJ569DSB0B8KSKL9NRU2O2327MIDT55785T8LS9TJGDW4GFMCMOZNRVN3GPSXF0Y6DGC32` },
31
+ },
28
32
{
29
- name : "valid pattern - with keyword scrapingbee" ,
30
- input : fmt .Sprintf ("%s token = '%s'" , keyword , validPattern ),
31
- want : []string {validPattern },
33
+ name : `valid_csharp` ,
34
+ input : ` class test{
35
+
36
+ string BASE_URL = @"https://app.scrapingbee.com/api/v1/";
37
+ string API_KEY = "2OZ3HPYEUP9LVCN9TSMBEP5OU0C65AXL7MDO76VPYQNVAJW8NU0QUQQPEV7C51XQDLZUUYKZ5TAW2L85";
38
+
39
+ public static string Get(string url)` ,
40
+ want : []string {`2OZ3HPYEUP9LVCN9TSMBEP5OU0C65AXL7MDO76VPYQNVAJW8NU0QUQQPEV7C51XQDLZUUYKZ5TAW2L85` },
41
+ },
42
+ {
43
+ name : `valid_js1` ,
44
+ input : ` const options = {
45
+ uri: "https://app.scrapingbee.com/api/v1?",
46
+ api_key: "34TOQQ77QJALLR07ISPYL4B5EYHW3YLU5GM97GQOCA32BVW3S0S6RTVFCZGTHZ1Q5MHH1Z9GZ0B640LI",
47
+ };` ,
48
+ want : []string {`34TOQQ77QJALLR07ISPYL4B5EYHW3YLU5GM97GQOCA32BVW3S0S6RTVFCZGTHZ1Q5MHH1Z9GZ0B640LI` },
32
49
},
33
50
{
34
- name : "valid pattern - ignore duplicate" ,
35
- input : fmt .Sprintf ("%s token = '%s' | '%s'" , keyword , validPattern , validPattern ),
36
- want : []string {validPattern },
51
+ name : `valid_js2` ,
52
+ input : ` useEffect(() => {
53
+ setLoading(true)
54
+ base.get('https://app.scrapingbee.com/api/v1', {
55
+ params:{'api_key':'BYZCNNS0SOZCPC4EXD5SXSH0PWAXPWFMZ4SXVEQNEDMKSGBP57K31PJ44V46344XCYN7IARKQWLS0V3X',
56
+ 'url': 'https://www.flipkart.com/search?q=${searchItem}',
57
+ 'block_resources': 'false',
58
+ }
59
+ }).then((response) => {` ,
60
+ want : []string {`BYZCNNS0SOZCPC4EXD5SXSH0PWAXPWFMZ4SXVEQNEDMKSGBP57K31PJ44V46344XCYN7IARKQWLS0V3X` },
37
61
},
38
62
{
39
- name : "valid pattern - key out of prefix range" ,
40
- input : fmt .Sprintf ("%s keyword is not close to the real key in the data\n = '%s'" , keyword , validPattern ),
41
- want : []string {},
63
+ name : `valid_js3` ,
64
+ input : `const scrapingBeeApiKey =
65
+ "P5IS953T7OYL5KJG8J3SVPAV5VUJ49L2OXB7HIQDVL8SSG7O9A3J6DQ6CTK65KEAM7L7MQJIEW20ZOCP"; // Replace 'YOUR_SCRAPING_BEE_API_KEY' with your actual API key` ,
66
+ want : []string {`P5IS953T7OYL5KJG8J3SVPAV5VUJ49L2OXB7HIQDVL8SSG7O9A3J6DQ6CTK65KEAM7L7MQJIEW20ZOCP` },
42
67
},
43
68
{
44
- name : "invalid pattern" ,
45
- input : fmt .Sprintf ("%s = '%s'" , keyword , invalidPattern ),
46
- want : []string {},
69
+ name : `valid_php` ,
70
+ input : `// Set base url & API key
71
+ $BASE_URL = "https://app.scrapingbee.com/api/v1/?";
72
+ $API_KEY = "R4EEK5MWM2GXNK1TZUU9Z0EBA29ZUW7PW12MHI4T1BHSR7GM1G37C5BL2NHLPWC0J6VOQWP5IZJ15QV8";
73
+ ` ,
74
+ want : []string {`R4EEK5MWM2GXNK1TZUU9Z0EBA29ZUW7PW12MHI4T1BHSR7GM1G37C5BL2NHLPWC0J6VOQWP5IZJ15QV8` },
75
+ },
76
+ {
77
+ name : `valid_python_sdk` ,
78
+ input : `client = ScrapingBeeClient(api_key='MZ13G1AVV8C5MEYVOIMIGJEPUH0PBSJPYTCO6IUWRZS3BXNOLA4TUP27ZGQ97LS8NRBCO66WF3ZUKSFX')` ,
79
+ want : []string {`MZ13G1AVV8C5MEYVOIMIGJEPUH0PBSJPYTCO6IUWRZS3BXNOLA4TUP27ZGQ97LS8NRBCO66WF3ZUKSFX` },
80
+ },
81
+ {
82
+ name : `valid_python_sdk_newline` ,
83
+ input : `def main():
84
+ client = ScrapingBeeClient(
85
+ api_key='E1PJA1D78TBTM320Z8O9XS2MTWHTCL1NSJXGFKIZO6TJB4XIM94OSR6KQNU415QB97MYJEP6T3O0IWR3')` ,
86
+
87
+ want : []string {`E1PJA1D78TBTM320Z8O9XS2MTWHTCL1NSJXGFKIZO6TJB4XIM94OSR6KQNU415QB97MYJEP6T3O0IWR3` },
88
+ },
89
+ {
90
+ name : `valid_python_notebook` ,
91
+ input : ` "source": [
92
+ "Every time you call any function there is an HTTPS request to Google's servers. To prevent your servers IP address being locked by Google we should use a service that handles proxy rotation for us. In this case we are using **ScrapingBee API**.\n",
93
+ "\n",
94
+ "ScrapingBee API key:\n",
95
+ "\n",
96
+ " QEUXIXLN8OULIISPZ1FXZUCWF7M42ZOUXRV7491R6RYQTFCSV8A4Y1B2YFPCD0HL2X62KPGTHFODSW6G\n",
97
+ "\n",
98
+ "NOTE: This API key is available till 08 March 2021 and expires after 200 requests \n",
99
+ "NOTE: **this Python package still works out of the box**."
100
+ ]` ,
101
+
102
+ want : []string {`QEUXIXLN8OULIISPZ1FXZUCWF7M42ZOUXRV7491R6RYQTFCSV8A4Y1B2YFPCD0HL2X62KPGTHFODSW6G` },
103
+ },
104
+ {
105
+ name : `valid_python_nonapiurl` ,
106
+ input : `##########################################################################################################
107
+ # We use the best scraper service API, Scraping Bee.
108
+ api_key = "CXUWSH6Y2BRB8F07MB7YXWPYWV2TQ4K51G4N6SGEU1YDADAVDW35ZT7WNISZ8YMCQ810OP9KG22ZI2P2"` ,
109
+ want : []string {`CXUWSH6Y2BRB8F07MB7YXWPYWV2TQ4K51G4N6SGEU1YDADAVDW35ZT7WNISZ8YMCQ810OP9KG22ZI2P2` },
110
+ },
111
+ {
112
+ name : `valid_underscore` ,
113
+ input : ` gn = GoogleNews()
114
+
115
+ # it's a fake API key, do not try to use it
116
+ gn.top_news(scraping_bee = 'I5SYNPRFZI41WHVQWWUT0GNXFMO104343E7CXFIISR01E2V8ETSMXMJFK1XNKM7FDEEPUPRM0FYAHFF5')` ,
117
+
118
+ want : []string {`I5SYNPRFZI41WHVQWWUT0GNXFMO104343E7CXFIISR01E2V8ETSMXMJFK1XNKM7FDEEPUPRM0FYAHFF5` },
119
+ },
120
+ // TODO: support this case, where the key appears before the "scrapingbee" keyword instead of after it
121
+ // {
122
+ // name: `valid_js_suffix`,
123
+ // input: ` do {
124
+ // // const apiKey = 'TQ9CDAZSORUPU1NMZXZEM11VY7K3NC3HJPBNYP2V4CZZXUY9SWEULNDHOZ77XGWO9FA9A12XWFVWUBZJ';
125
+ // // const client = new scrapingbee.ScrapingBeeClient(apiKey);
126
+ // `,
127
+ //
128
+ // want: []string{ `TQ9CDAZSORUPU1NMZXZEM11VY7K3NC3HJPBNYP2V4CZZXUY9SWEULNDHOZ77XGWO9FA9A12XWFVWUBZJ`},
129
+ // },
130
+
131
+ // False positives
132
+ {
133
+ name : `invalid - lowercase` ,
134
+ input : `const scrapingbeeKey = 'tq9cdazsorupu1nmzxzem11vy7k3nc3hjpbnyp2v4czzxuy9sweulndhoz77xgwo9fa9a12xwfvwubzj'` ,
47
135
},
48
136
}
49
137
138
+ d := Scanner {}
139
+ ahoCorasickCore := ahocorasick .NewAhoCorasickCore ([]detectors.Detector {d })
50
140
for _ , test := range tests {
51
141
t .Run (test .name , func (t * testing.T ) {
52
142
matchedDetectors := ahoCorasickCore .FindDetectorMatches ([]byte (test .input ))
0 commit comments