@@ -10,14 +10,14 @@ x-crawl is a flexible Node.js multifunctional crawler library. Flexible usage an
- **🔥 Asynchronous Synchronous** - Just change the mode property to toggle asynchronous or synchronous crawling mode (see the sketch after this list).
- **⚙️ Multiple purposes** - Crawl pages, crawl interfaces, crawl files, and run polling crawls to meet the needs of various scenarios.
+ - **☁️ Crawl SPA** - Crawl SPAs (Single Page Applications) to generate pre-rendered content (aka "SSR" (Server Side Rendering)).
+ - **⚒️ Control Page** - Automate form submission, UI testing, keyboard input, event manipulation, opening the browser, and more.
- **🖋️ Flexible writing style** - The same crawling API adapts to multiple configurations, and each configuration method has its own advantages.
- **⏱️ Interval Crawling** - No interval, fixed interval, or random interval, to produce or avoid highly concurrent crawling.
- **🔄 Failed Retry** - Avoid crawl failures caused by transient problems, with a customizable number of retries.
- **➡️ Proxy Rotation** - Auto-rotate proxies with failure retry, custom error counts, and HTTP status codes.
- **👀 Device Fingerprinting** - Zero or custom configuration to avoid being identified and tracked by fingerprinting from different locations.
- **🚀 Priority Queue** - Give a single crawling target a priority so it is crawled ahead of other targets.
- - **☁️ Crawl SPA** - Crawl SPA (Single Page Application) to generate pre-rendered content (aka "SSR" (Server Side Rendering)).
- - **⚒️ Control Page** - You can submit forms, enter keyboard input, perform event operations, generate screenshots of the page, etc.
- **🧾 Capture Record** - Capture and record crawling, with colored output in the terminal as a reminder.
- **🦾 TypeScript** - Ships its own types, implementing complete typing through generics.
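For the asynchronous/synchronous toggle in the first bullet, a minimal sketch; the `mode` option and its `'async'`/`'sync'` values are taken from the feature description and assumed here, since they are not shown elsewhere in this diff:

```js
import xCrawl from 'x-crawl'

// Assumption: a top-level `mode` option accepting 'async' | 'sync'.
// In 'async' mode the crawling targets are processed concurrently ...
const asyncCrawler = xCrawl({ mode: 'async', intervalTime: { max: 3000, min: 2000 } })

// ... in 'sync' mode they are processed one at a time; only this property changes.
const syncCrawler = xCrawl({ mode: 'sync', intervalTime: { max: 3000, min: 2000 } })
```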
@@ -136,7 +136,7 @@ Take the automatic acquisition of some photos of experiences and homes around th
import xCrawl from 'x-crawl'

// 2. Create a crawler instance
- const myXCrawl = xCrawl({maxRetry: 3,intervalTime: { max: 3000, min: 2000 }})
+ const myXCrawl = xCrawl({ maxRetry: 3, intervalTime: { max: 3000, min: 2000 } })

// 3. Set the crawling task
/*
@@ -164,20 +164,17 @@ myXCrawl.startPolling({ d: 1 }, async (count, stopPolling) => {
await new Promise((r) => setTimeout(r, 300))

// Gets the URL of the page image
- const urls = await page.$$eval(
- `${elSelectorMap[id - 1]} img`,
- (imgEls) => {
- return imgEls.map((item) => item.src)
- }
- )
+ const urls = await page.$$eval(`${elSelectorMap[id - 1]} img`, (imgEls) => {
+ return imgEls.map((item) => item.src)
+ })
targets.push(...urls)

// Close page
page.close()
}

// Call the crawlFile API to crawl pictures
- myXCrawl.crawlFile({ targets, storeDir: './upload' })
+ myXCrawl.crawlFile({ targets, storeDirs: './upload' })
})
```
@@ -283,7 +280,7 @@ myXCrawl.crawlPage('https://www.example.com').then((res) => {
#### Browser Instance

- When you call crawlPage API to crawl pages in the same crawler instance, the browser instance used is the same, because the crawlPage API of the browser instance in the same crawler instance is shared. For specific usage, please refer to [Browser](https://pptr.dev/api/puppeteer.browser).
+ When you call the crawlPage API to crawl pages in the same crawler instance, the same browser instance is used, because the browser instance is shared across crawlPage calls within that crawler instance. For specific usage, please refer to [Browser](https://pptr.dev/api/puppeteer.browser).

**Note:** The browser keeps running and the process will not terminate on its own. If you want to stop it, execute browser.close(), but do not close it if you still need to use [crawlPage](#crawlPage) or [page](#page) later, because the browser instance is shared across crawlPage calls within the same crawler instance.
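A minimal sketch of reusing and then closing the shared browser, assuming the crawlPage result exposes the page and browser on `res.data` (not shown in this diff):

```js
import xCrawl from 'x-crawl'

const myXCrawl = xCrawl({ maxRetry: 3 })

myXCrawl.crawlPage('https://www.example.com').then(async (res) => {
  // Assumption: the result data carries the Puppeteer page and the shared browser
  const { page, browser } = res.data

  console.log(await page.title())
  await page.close()

  // Close the shared browser only when no further crawlPage calls are needed,
  // otherwise the process keeps running as described in the note above
  await browser.close()
})
```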
@@ -332,9 +329,9 @@ Disable running the browser in headless mode.
import xCrawl from 'x-crawl'

const myXCrawl = xCrawl({
- maxRetry: 3,
- // Cancel running the browser in headless mode
- crawlPage: { launchBrowser: { headless: false } }
+ maxRetry: 3,
+ // Cancel running the browser in headless mode
+ crawlPage: { launchBrowser: { headless: false } }
})

myXCrawl.crawlPage('https://www.example.com').then((res) => {})
@@ -389,9 +386,7 @@ myXCrawl
'https://www.example.com/file-1',
'https://www.example.com/file-2'
],
- fileConfig: {
- storeDir: './upload' // storage folder
- }
+ storeDirs: './upload' // storage folder
})
.then((res) => {
console.log(res)
@@ -430,10 +425,8 @@ myXCrawl
'https://www.example.com/file-1.jpg',
'https://www.example.com/file-2.jpg'
],
- fileConfig: {
- onBeforeSaveItemFile(info) {
- return sharp(info.data).resize(200).toBuffer()
- }
+ onBeforeSaveItemFile(info) {
+ return sharp(info.data).resize(200).toBuffer()
}
})
.then((res) => {
@@ -1172,7 +1165,7 @@ myXCrawl
'https://www.example.com/file-1',
'https://www.example.com/file-2'
],
- storeDir: './upload',
+ storeDirs: './upload',
intervalTime: { max: 3000, min: 1000 },
maxRetry: 1
})
@@ -1247,7 +1240,7 @@ myXCrawl
'https://www.example.com/file-1',
{ url: 'https://www.example.com/file-2', storeDir: './upload/xxx' }
],
- storeDir: './upload',
+ storeDirs: './upload',
intervalTime: { max: 3000, min: 1000 },
maxRetry: 1
})
@@ -1381,7 +1374,7 @@ export interface CrawlFileDetailTargetConfig extends CrawlCommonConfig {
headers?: AnyObject | null
priority?: number
storeDir?: string | null
- fileName?: string
+ fileName?: string | null
extension?: string | null
fingerprint?: DetailTargetFingerprintCommon | null
}
@@ -1461,10 +1454,11 @@ export interface CrawlFileAdvancedConfig extends CrawlCommonConfig {
targets: (string | CrawlFileDetailTargetConfig)[]
intervalTime?: IntervalTime
fingerprints?: DetailTargetFingerprintCommon[]
+ storeDirs?: string | (string | null)[]
+ extensions?: string | (string | null)[]
+ fileNames?: (string | null)[]

headers?: AnyObject
- storeDir?: string
- extension?: string

onCrawlItemComplete?: (crawlFileSingleResult: CrawlFileSingleResult) => void
onBeforeSaveItemFile?: (info: {
@@ -1481,9 +1475,10 @@ export interface CrawlFileAdvancedConfig extends CrawlCommonConfig {
- targets: undefined
- intervalTime: undefined
- fingerprints: undefined
- - headers: undefined
- storeDir: \_\_dirname
- extension: string
+ - fileNames: undefined
+ - headers: undefined
- onCrawlItemComplete: undefined
- onBeforeSaveItemFile: undefined
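To illustrate the new array-style options added to CrawlFileAdvancedConfig above, a minimal sketch; the per-index mapping of `storeDirs`/`fileNames`/`extensions` entries to targets, and `null` falling back to the defaults, are assumptions based on the types shown in this diff:

```js
import xCrawl from 'x-crawl'

const myXCrawl = xCrawl({ maxRetry: 1 })

myXCrawl
  .crawlFile({
    targets: [
      'https://www.example.com/file-1.jpg',
      'https://www.example.com/file-2.jpg'
    ],
    // Assumption: each entry applies to the target at the same index,
    // and null falls back to the default value
    storeDirs: ['./upload/a', null],
    fileNames: ['custom-name-1', null],
    extensions: ['.jpg', null]
  })
  .then((res) => {
    console.log(res)
  })
```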