
Commit 85936f9

docs: crawlFile API related documentation

1 parent ea372e1

File tree: 3 files changed, +45 −50 lines

README.md (+12 −14)

````diff
@@ -174,7 +174,7 @@ myXCrawl.startPolling({ d: 1 }, async (count, stopPolling) => {
   }
 
   // Call the crawlFile API to crawl pictures
-  myXCrawl.crawlFile({ targets, storeDir: './upload' })
+  myXCrawl.crawlFile({ targets, storeDirs: './upload' })
 })
 ```
 
````

```diff
@@ -386,9 +386,7 @@ myXCrawl
       'https://www.example.com/file-1',
       'https://www.example.com/file-2'
     ],
-    fileConfig: {
-      storeDir: './upload' // storage folder
-    }
+    storeDirs: './upload' // storage folder
   })
   .then((res) => {
     console.log(res)
```
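
For context, here is the flattened call after this change as a runnable sketch (the instance options and URLs are placeholders modeled on the surrounding examples):

```ts
import xCrawl from 'x-crawl'

// Placeholder instance options, mirroring other examples in these docs.
const myXCrawl = xCrawl({ intervalTime: { max: 3000, min: 1000 } })

// The storage folder now sits directly on the advanced config
// instead of being nested under fileConfig.
myXCrawl
  .crawlFile({
    targets: [
      'https://www.example.com/file-1',
      'https://www.example.com/file-2'
    ],
    storeDirs: './upload' // storage folder
  })
  .then((res) => {
    console.log(res)
  })
```
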
```diff
@@ -427,10 +425,8 @@ myXCrawl
       'https://www.example.com/file-1.jpg',
       'https://www.example.com/file-2.jpg'
     ],
-    fileConfig: {
-      onBeforeSaveItemFile(info) {
-        return sharp(info.data).resize(200).toBuffer()
-      }
+    onBeforeSaveItemFile(info) {
+      return sharp(info.data).resize(200).toBuffer()
     }
   })
   .then((res) => {
```
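
The same flattening applies to the save hook. A sketch of the new shape, assuming sharp is installed; the Buffer returned by onBeforeSaveItemFile is written to disk in place of the raw download:

```ts
import xCrawl from 'x-crawl'
import sharp from 'sharp'

const myXCrawl = xCrawl()

myXCrawl
  .crawlFile({
    targets: [
      'https://www.example.com/file-1.jpg',
      'https://www.example.com/file-2.jpg'
    ],
    // Now a top-level option rather than fileConfig.onBeforeSaveItemFile:
    onBeforeSaveItemFile(info) {
      // Resize each downloaded image to 200px wide before it is saved.
      return sharp(info.data).resize(200).toBuffer()
    }
  })
  .then((res) => {
    console.log(res)
  })
```
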
```diff
@@ -1169,7 +1165,7 @@ myXCrawl
       'https://www.example.com/file-1',
       'https://www.example.com/file-2'
     ],
-    storeDir: './upload',
+    storeDirs: './upload',
     intervalTime: { max: 3000, min: 1000 },
     maxRetry: 1
   })
```

```diff
@@ -1244,7 +1240,7 @@ myXCrawl
       'https://www.example.com/file-1',
       { url: 'https://www.example.com/file-2', storeDir: './upload/xxx' }
     ],
-    storeDir: './upload',
+    storeDirs: './upload',
     intervalTime: { max: 3000, min: 1000 },
     maxRetry: 1
   })
```
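
Note how this example mixes both levels. Our reading (not spelled out in this commit) is that a detail target's own storeDir overrides the top-level storeDirs for that item only:

```ts
import xCrawl from 'x-crawl'

const myXCrawl = xCrawl({ maxRetry: 1 })

myXCrawl.crawlFile({
  targets: [
    // Saved under the top-level folder, ./upload.
    'https://www.example.com/file-1',
    // The per-target storeDir presumably wins for this item.
    { url: 'https://www.example.com/file-2', storeDir: './upload/xxx' }
  ],
  storeDirs: './upload',
  intervalTime: { max: 3000, min: 1000 }
})
```
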
```diff
@@ -1378,7 +1374,7 @@ export interface CrawlFileDetailTargetConfig extends CrawlCommonConfig {
   headers?: AnyObject | null
   priority?: number
   storeDir?: string | null
-  fileName?: string
+  fileName?: string | null
   extension?: string | null
   fingerprint?: DetailTargetFingerprintCommon | null
 }
```
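
A small sketch of what the widened fileName type permits; that an explicit null defers to the crawler's generated file name is our assumption, not something this diff states:

```ts
// Assumes the interface is exported as declared in the docs' type listing.
import type { CrawlFileDetailTargetConfig } from 'x-crawl'

const target: CrawlFileDetailTargetConfig = {
  url: 'https://www.example.com/file-2',
  storeDir: './upload/xxx',
  fileName: null // explicitly fall back to the generated file name (assumed)
}
```
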
```diff
@@ -1458,10 +1454,11 @@ export interface CrawlFileAdvancedConfig extends CrawlCommonConfig {
   targets: (string | CrawlFileDetailTargetConfig)[]
   intervalTime?: IntervalTime
   fingerprints?: DetailTargetFingerprintCommon[]
+  storeDirs?: string | (string | null)[]
+  extensions?: string | (string | null)[]
+  fileNames?: (string | null)[]
 
   headers?: AnyObject
-  storeDir?: string
-  extension?: string
 
   onCrawlItemComplete?: (crawlFileSingleResult: CrawlFileSingleResult) => void
   onBeforeSaveItemFile?: (info: {
```
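
The new plural options accept either a single value for all targets or an array; pairing index i with targets[i], with null falling back to the default, is our reading of the (string | null)[] signatures:

```ts
import xCrawl from 'x-crawl'

const myXCrawl = xCrawl()

myXCrawl.crawlFile({
  targets: [
    'https://www.example.com/file-1',
    'https://www.example.com/file-2'
  ],
  storeDirs: ['./upload/a', null], // per-target folders; null → default
  extensions: '.jpg',              // one extension applied to every target
  fileNames: ['first', null]       // null keeps the generated name
})
```
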
```diff
@@ -1478,9 +1475,10 @@ export interface CrawlFileAdvancedConfig extends CrawlCommonConfig {
 - targets: undefined
 - intervalTime: undefined
 - fingerprints: undefined
-- headers: undefined
 - storeDir: \_\_dirname
 - extension: string
+- fileNames: undefined
+- headers: undefined
 - onCrawlItemComplete: undefined
 - onBeforeSaveItemFile: undefined
```

docs/cn.md (+11 −9)

````diff
@@ -172,7 +172,7 @@ myXCrawl.startPolling({ d: 1 }, async (count, stopPolling) => {
   }
 
   // Call the crawlFile API to crawl the images
-  await myXCrawl.crawlFile({ targets, storeDir: './upload' })
+  await myXCrawl.crawlFile({ targets, storeDirs: './upload' })
 })
 ```
 
````

````diff
@@ -385,7 +385,7 @@ myXCrawl
       'https://www.example.com/file-1',
       'https://www.example.com/file-2'
     ],
-    storeDir: './upload' // storage folder
+    storeDirs: './upload' // storage folder
   })
   .then((res) => {})
 ```
````

```diff
@@ -1160,7 +1160,7 @@ myXCrawl
       'https://www.example.com/file-1',
       'https://www.example.com/file-2'
     ],
-    storeDir: './upload',
+    storeDirs: './upload',
     intervalTime: { max: 3000, min: 1000 },
     maxRetry: 1
   })
```

```diff
@@ -1235,7 +1235,7 @@ myXCrawl
       'https://www.example.com/file-1',
       { url: 'https://www.example.com/file-2', storeDir: './upload/file2' }
     ],
-    storeDir: './upload',
+    storeDirs: './upload',
     intervalTime: { max: 3000, min: 1000 },
     maxRetry: 1
   })
```

```diff
@@ -1369,7 +1369,7 @@ export interface CrawlFileDetailTargetConfig extends CrawlCommonConfig {
   headers?: AnyObject | null
   priority?: number
   storeDir?: string | null
-  fileName?: string
+  fileName?: string | null
   extension?: string | null
   fingerprint?: DetailTargetFingerprintCommon | null
 }
```

```diff
@@ -1449,10 +1449,11 @@ export interface CrawlFileAdvancedConfig extends CrawlCommonConfig {
   targets: (string | CrawlFileDetailTargetConfig)[]
   intervalTime?: IntervalTime
   fingerprints?: DetailTargetFingerprintCommon[]
+  storeDirs?: string | (string | null)[]
+  extensions?: string | (string | null)[]
+  fileNames?: (string | null)[]
 
   headers?: AnyObject
-  storeDir?: string
-  extension?: string
 
   onCrawlItemComplete?: (crawlFileSingleResult: CrawlFileSingleResult) => void
   onBeforeSaveItemFile?: (info: {
```

```diff
@@ -1469,9 +1470,10 @@ export interface CrawlFileAdvancedConfig extends CrawlCommonConfig {
 - targets: undefined
 - intervalTime: undefined
 - fingerprints: undefined
+- storeDirs: \_\_dirname
+- extensions: string
+- fileNames: undefined
 - headers: undefined
-- storeDir: \_\_dirname
-- extension: string
 - onCrawlItemComplete: undefined
 - onBeforeSaveItemFile: undefined
```

publish/README.md (+22 −27)

```diff
@@ -10,14 +10,14 @@ x-crawl is a flexible Node.js multifunctional crawler library. Flexible usage an
 
 - **🔥 Asynchronous Synchronous** - Just change the mode property to toggle asynchronous or synchronous crawling mode.
 - **⚙️ Multiple purposes** - It can crawl pages, crawl interfaces, crawl files and poll crawls to meet the needs of various scenarios.
+- **☁️ Crawl SPA** - Crawl SPA (Single Page Application) to generate pre-rendered content (aka "SSR" (Server Side Rendering)).
+- **⚒️ Control Page** - Automate form submission, UI testing, keyboard input, event manipulation, open browser, etc.
 - **🖋️ Flexible writing style** - The same crawling API can be adapted to multiple configurations, and each configuration method is very unique.
 - **⏱️ Interval Crawling** - No interval, fixed interval and random interval to generate or avoid high concurrent crawling.
 - **🔄 Failed Retry** - Avoid crawling failure due to short-term problems, and customize the number of retries.
 - **➡️ Proxy Rotation** - Auto-rotate proxies with failure retry, custom error times and HTTP status codes.
 - **👀 Device Fingerprinting** - Zero configuration or custom configuration, avoid fingerprinting to identify and track us from different locations.
 - **🚀 Priority Queue** - According to the priority of a single crawling target, it can be crawled ahead of other targets.
-- **☁️ Crawl SPA** - Crawl SPA (Single Page Application) to generate pre-rendered content (aka "SSR" (Server Side Rendering)).
-- **⚒️ Control Page** - You can submit form, keyboard input, event operation, generate screenshots of the page, etc.
 - **🧾 Capture Record** - Capture and record crawling, and use colored strings to remind in the terminal.
 - **🦾 TypeScript** - Own types, implement complete types through generics.
```

```diff
@@ -136,7 +136,7 @@ Take the automatic acquisition of some photos of experiences and homes around th
 import xCrawl from 'x-crawl'
 
 // 2.Create a crawler instance
-const myXCrawl = xCrawl({maxRetry: 3,intervalTime: { max: 3000, min: 2000 }})
+const myXCrawl = xCrawl({ maxRetry: 3, intervalTime: { max: 3000, min: 2000 } })
 
 // 3.Set the crawling task
 /*
```

````diff
@@ -164,20 +164,17 @@ myXCrawl.startPolling({ d: 1 }, async (count, stopPolling) => {
     await new Promise((r) => setTimeout(r, 300))
 
     // Gets the URL of the page image
-    const urls = await page.$$eval(
-      `${elSelectorMap[id - 1]} img`,
-      (imgEls) => {
-        return imgEls.map((item) => item.src)
-      }
-    )
+    const urls = await page.$$eval(`${elSelectorMap[id - 1]} img`, (imgEls) => {
+      return imgEls.map((item) => item.src)
+    })
     targets.push(...urls)
 
     // Close page
     page.close()
   }
 
   // Call the crawlFile API to crawl pictures
-  myXCrawl.crawlFile({ targets, storeDir: './upload' })
+  myXCrawl.crawlFile({ targets, storeDirs: './upload' })
 })
 ```
 
````

```diff
@@ -283,7 +280,7 @@ myXCrawl.crawlPage('https://www.example.com').then((res) => {
 
 #### Browser Instance
 
-When you call crawlPage API to crawl pages in the same crawler instance, the browser instance used is the same, because the crawlPage API of the browser instance in the same crawler instance is shared. For specific usage, please refer to [Browser](https://pptr.dev/api/puppeteer.browser).
+When you call crawlPage API to crawl pages in the same crawler instance, the browser instance used is the same, because the crawlPage API of the browser instance in the same crawler instance is shared. For specific usage, please refer to [Browser](https://pptr.dev/api/puppeteer.browser).
 
 **Note:** The browser will keep running and the file will not be terminated. If you want to stop, you can execute browser.close() to close it. Do not call [crawlPage](#crawlPage) or [page](#page) if you need to use it later. Because the crawlPage API of the browser instance in the same crawler instance is shared.
```

```diff
@@ -332,9 +329,9 @@ Disable running the browser in headless mode.
 import xCrawl from 'x-crawl'
 
 const myXCrawl = xCrawl({
-maxRetry: 3,
-// Cancel running the browser in headless mode
-crawlPage: { launchBrowser: { headless: false } }
+  maxRetry: 3,
+  // Cancel running the browser in headless mode
+  crawlPage: { launchBrowser: { headless: false } }
 })
 
 myXCrawl.crawlPage('https://www.example.com').then((res) => {})
```

```diff
@@ -389,9 +386,7 @@ myXCrawl
       'https://www.example.com/file-1',
       'https://www.example.com/file-2'
     ],
-    fileConfig: {
-      storeDir: './upload' // storage folder
-    }
+    storeDirs: './upload' // storage folder
   })
   .then((res) => {
     console.log(res)
```

```diff
@@ -430,10 +425,8 @@ myXCrawl
       'https://www.example.com/file-1.jpg',
       'https://www.example.com/file-2.jpg'
     ],
-    fileConfig: {
-      onBeforeSaveItemFile(info) {
-        return sharp(info.data).resize(200).toBuffer()
-      }
+    onBeforeSaveItemFile(info) {
+      return sharp(info.data).resize(200).toBuffer()
     }
   })
   .then((res) => {
```

```diff
@@ -1172,7 +1165,7 @@ myXCrawl
       'https://www.example.com/file-1',
       'https://www.example.com/file-2'
     ],
-    storeDir: './upload',
+    storeDirs: './upload',
     intervalTime: { max: 3000, min: 1000 },
     maxRetry: 1
   })
```

```diff
@@ -1247,7 +1240,7 @@ myXCrawl
       'https://www.example.com/file-1',
       { url: 'https://www.example.com/file-2', storeDir: './upload/xxx' }
     ],
-    storeDir: './upload',
+    storeDirs: './upload',
     intervalTime: { max: 3000, min: 1000 },
     maxRetry: 1
   })
```

```diff
@@ -1381,7 +1374,7 @@ export interface CrawlFileDetailTargetConfig extends CrawlCommonConfig {
   headers?: AnyObject | null
   priority?: number
   storeDir?: string | null
-  fileName?: string
+  fileName?: string | null
   extension?: string | null
   fingerprint?: DetailTargetFingerprintCommon | null
 }
```

```diff
@@ -1461,10 +1454,11 @@ export interface CrawlFileAdvancedConfig extends CrawlCommonConfig {
   targets: (string | CrawlFileDetailTargetConfig)[]
   intervalTime?: IntervalTime
   fingerprints?: DetailTargetFingerprintCommon[]
+  storeDirs?: string | (string | null)[]
+  extensions?: string | (string | null)[]
+  fileNames?: (string | null)[]
 
   headers?: AnyObject
-  storeDir?: string
-  extension?: string
 
   onCrawlItemComplete?: (crawlFileSingleResult: CrawlFileSingleResult) => void
   onBeforeSaveItemFile?: (info: {
```

```diff
@@ -1481,9 +1475,10 @@ export interface CrawlFileAdvancedConfig extends CrawlCommonConfig {
 - targets: undefined
 - intervalTime: undefined
 - fingerprints: undefined
-- headers: undefined
 - storeDir: \_\_dirname
 - extension: string
+- fileNames: undefined
+- headers: undefined
 - onCrawlItemComplete: undefined
 - onBeforeSaveItemFile: undefined
```
