@@ -16,7 +16,7 @@ x-crawl is a Nodejs multifunctional crawler library.
16
16
17
17
## Relationship with puppeteer
18
18
19
- The fetchHTML API internally uses the [ puppeteer ] ( https://github.com/puppeteer/puppeteer ) library to crawl pages.
19
+ The fetchPage API internally uses the [ puppeteer] ( https://github.com/puppeteer/puppeteer ) library to crawl pages.
20
20
21
21
The following can be done:
22
22
@@ -34,7 +34,7 @@ The following can be done:
34
34
+ [ Example] ( #Example-1 )
35
35
+ [ Mode] ( #Mode )
36
36
+ [ IntervalTime] ( #IntervalTime )
37
- * [ fetchHTML ] ( #fetchHTML )
37
+ * [ fetchPage ] ( #fetchPage )
38
38
+ [ Type] ( #Type-2 )
39
39
+ [ Example] ( #Example-2 )
40
40
+ [ About page] ( #About-page )
@@ -50,19 +50,19 @@ The following can be done:
50
50
- [ Types] ( #Types )
51
51
* [ AnyObject] ( #AnyObject )
52
52
* [ Method] ( #Method )
53
+ * [ RequestBaseConfig] ( #RequestBaseConfig )
53
54
* [ RequestConfig] ( #RequestConfig )
54
55
* [ IntervalTime] ( #IntervalTime )
55
56
* [ XCrawlBaseConfig] ( #XCrawlBaseConfig )
56
57
* [ FetchBaseConfigV1] ( #FetchBaseConfigV1 )
57
- * [ FetchBaseConfigV2] ( #FetchBaseConfigV2 )
58
- * [ FetchHTMLConfig] ( #FetchHTMLConfig )
58
+ * [ FetchPageConfig] ( #FetchPageConfig )
59
59
* [ FetchDataConfig] ( #FetchDataConfig )
60
60
* [ FetchFileConfig] ( #FetchFileConfig )
61
61
* [ StartPollingConfig] ( #StartPollingConfig )
62
62
* [ FetchResCommonV1] ( #FetchResCommonV1 )
63
63
* [ FetchResCommonArrV1] ( #FetchResCommonArrV1 )
64
64
* [ FileInfo] ( #FileInfo )
65
- * [ FetchHTML ] ( #FetchHTML )
65
+ * [ FetchPage ] ( #FetchPage )
66
66
- [ More] ( #More )
67
67
68
68
## Install
@@ -90,9 +90,9 @@ const myXCrawl = xCrawl({
90
90
// 3.Set the crawling task
91
91
// Call the startPolling API to start the polling function; the callback function will be called once a day
92
92
myXCrawl .startPolling ({ d: 1 }, () => {
93
- // Call fetchHTML API to crawl HTML
94
- myXCrawl .fetchHTML (' https://www.youtube.com/' ).then ((res ) => {
95
- const { jsdom } = res .data // By default, the JSDOM library is used to parse HTML
93
+ // Call fetchPage API to crawl Page
94
+ myXCrawl .fetchPage (' https://www.youtube.com/' ).then ((res ) => {
95
+ const { jsdom } = res .data // By default, the JSDOM library is used to parse Page
96
96
97
97
// Get the cover image element of the Promoted Video
98
98
const imgEls = jsdom .window .document .querySelectorAll (
@@ -124,7 +124,7 @@ running result:
124
124
<img src =" https://raw.githubusercontent.com/coder-hxl/x-crawl/main/assets/en/crawler-result.png " />
125
125
</div >
126
126
127
- ** Note:** Do not crawl randomly, here is just to demonstrate how to use XCrawl , and control the request frequency within 3000ms to 2000ms.
127
+ ** Note:** Do not crawl arbitrarily; this example only demonstrates how to use x-crawl , and keeps the request interval between 2000ms and 3000ms.
128
128
129
129
## Core concepts
130
130
@@ -154,9 +154,9 @@ const myXCrawl = xCrawl({
154
154
})
155
155
` ` `
156
156
157
- Passing ** baseConfig ** is for ** fetchHTML / fetchData / fetchFile ** to use these values by default .
157
+ Passing ** baseConfig ** is for ** fetchPage / fetchData / fetchFile ** to use these values by default .
158
158
159
- ** Note :** To avoid repeated creation of instances in subsequent examples , ** myXCrawl ** here will be the crawler instance in the ** fetchHTML / fetchData / fetchFile ** example .
159
+ ** Note :** To avoid repeated creation of instances in subsequent examples , ** myXCrawl ** here will be the crawler instance in the ** fetchPage / fetchData / fetchFile ** example .
160
160
161
161
#### Mode
162
162
@@ -176,26 +176,26 @@ The intervalTime option defaults to undefined . If there is a setting value, it
176
176
177
177
The first request does not trigger the interval .
178
178
179
- ### fetchHTML
179
+ ### fetchPage
180
180
181
- fetchHTML is the method of the above [myXCrawl ](https :// github.com/coder-hxl/x-crawl#Example-1) instance, usually used to crawl page.
181
+ fetchPage is a method of the above [myXCrawl ](https :// github.com/coder-hxl/x-crawl#Example-1) instance, usually used to crawl pages.
182
182
183
183
#### Type
184
184
185
- - Look at the [FetchHTMLConfig ](#FetchHTMLConfig ) type
186
- - Look at the [FetchHTML ](#FetchHTML - 2 ) type
185
+ - Look at the [FetchPageConfig ](#FetchPageConfig ) type
186
+ - Look at the [FetchPage ](#FetchPage - 2 ) type
187
187
188
188
` ` ` ts
189
- function fetchHTML : (
190
- config: FetchHTMLConfig ,
191
- callback?: (res: FetchHTML ) => void
192
- ) => Promise<FetchHTML >
189
+ function fetchPage : (
190
+ config: FetchPageConfig ,
191
+ callback?: (res: FetchPage ) => void
192
+ ) => Promise<FetchPage >
193
193
` ` `
194
194
195
195
#### Example
196
196
197
197
` ` ` js
198
- myXCrawl.fetchHTML ('/xxx').then((res) => {
198
+ myXCrawl.fetchPage ('/xxx').then((res) => {
199
199
const { jsdom } = res.data
200
200
console.log(jsdom.window.document.querySelector('title')?.textContent)
201
201
})
@@ -296,7 +296,7 @@ function startPolling(
296
296
` ` ` js
297
297
myXCrawl.startPolling({ h: 1, m: 30 }, () => {
298
298
// will be executed every one and a half hours
299
- // fetchHTML /fetchData/fetchFile
299
+ // fetchPage /fetchData/fetchFile
300
300
})
301
301
` ` `
302
302
@@ -316,17 +316,24 @@ interface AnyObject extends Object {
316
316
type Method = 'get' | 'GET' | 'delete' | 'DELETE' | 'head' | 'HEAD' | 'options' | 'OPTIONS' | 'post' | 'POST' | 'put' | 'PUT' | 'patch' | 'PATCH' | 'purge' | 'PURGE' | 'link' | 'LINK' | 'unlink' | 'UNLINK'
317
317
` ` `
318
318
319
+ ### RequestBaseConfig
320
+
321
+ ` ` ` ts
322
+ interface RequestBaseConfig {
323
+ url: string
324
+ timeout?: number
325
+ proxy?: string
326
+ }
327
+ ` ` `
328
+
319
329
### RequestConfig
320
330
321
331
` ` ` ts
322
- interface RequestConfig {
323
- url: string
332
+ interface RequestConfig extends RequestBaseConfig {
324
333
method?: Method
325
334
headers?: AnyObject
326
335
params?: AnyObject
327
336
data?: any
328
- timeout?: number
329
- proxy?: string
330
337
}
331
338
` ` `
332
339
@@ -360,20 +367,10 @@ interface FetchBaseConfigV1 {
360
367
}
361
368
` ` `
362
369
363
- ### FetchBaseConfigV2
364
-
365
- ` ` ` ts
366
- interface FetchBaseConfigV2 {
367
- url: string
368
- timeout?: number
369
- proxy?: string
370
- }
371
- ` ` `
372
-
373
- ### FetchHTMLConfig
370
+ ### FetchPageConfig
374
371
375
372
` ` ` ts
376
- type FetchHTMLConfig = string | FetchBaseConfigV2
373
+ type FetchPageConfig = string | RequestBaseConfig
377
374
` ` `
378
375
379
376
### FetchDataConfig
@@ -432,10 +429,10 @@ interface FileInfo {
432
429
}
433
430
` ` `
434
431
435
- ### FetchHTML
432
+ ### FetchPage
436
433
437
434
` ` ` ts
438
- interface FetchHTML {
435
+ interface FetchPage {
439
436
httpResponse: HTTPResponse | null // The type of HTTPResponse in the puppeteer library
440
437
data: {
441
438
page: Page // The type of Page in the puppeteer library
0 commit comments