other

coder-hxl · coder-hxl · commit 24fd55673cef · 2023-02-28T11:03:20.000+08:00
diff --git a/README.md b/README.md
@@ -6,13 +6,19 @@ x-crawl is a Nodejs multifunctional crawler library.
 
 ## Feature
 
-- Crawl HTML, JSON, file resources, etc. with simple configuration
-- Use puppeteer to crawl HTML, and use JSDOM library to parse HTML, or parse HTML by yourself
-- Support asynchronous/synchronous way to crawl data
-- Support Promise/Callback way to get the result
-- Polling function
-- Anthropomorphic request interval
-- Written in TypeScript, provides generics
+- Crawl HTML, JSON, file resources, etc. with simple configuration.
+- Built-in puppeteer crawls HTML and uses JSDOM library to parse HTML.
+- Support asynchronous/synchronous way to crawl data.
+- Support Promise/Callback way to get the result.
+- Polling function.
+- Anthropomorphic request interval.
+- Written in TypeScript, provides generics.
+
+## Benefits provided by using puppeteer
+
+- Generate screenshots and PDFs of pages.
+- Crawl a SPA (Single-Page Application) and generate pre-rendered content (i.e. "SSR" (Server-Side Rendering)).
+- Automate form submission, UI testing, keyboard input, etc.
 
 # Table of Contents
 
@@ -41,14 +47,15 @@ x-crawl is a Nodejs multifunctional crawler library.
     * [Method](#Method)
     * [RequestConfig](#RequestConfig)
     * [IntervalTime](#IntervalTime)
-    * [FetchBaseConifg](#FetchBaseConifg)
     * [XCrawlBaseConifg](#XCrawlBaseConifg)
+    * [FetchBaseConifgV1](#FetchBaseConifgV1)
+    * [FetchBaseConifgV2](#FetchBaseConifgV2)
     * [FetchHTMLConfig](#FetchHTMLConfig	)
-    * [FetchDataConfig](#FetchDataConfig)   
+    * [FetchDataConfig](#FetchDataConfig) 
     * [FetchFileConfig](#FetchFileConfig)
     * [StartPollingConfig](#StartPollingConfig)
-    * [FetchCommon](#FetchCommon)
-    * [FetchCommonArr](#FetchCommonArr)
+    * [FetchResCommonV1](#FetchResCommonV1)
+    * [FetchResCommonArrV1](#FetchResCommonArrV1)
     * [FileInfo](#FileInfo)
     * [FetchHTML](#FetchHTML)
 - [More](#More)
@@ -318,7 +325,6 @@ interface FetchBaseConifgV1 {
 ```ts
 interface FetchBaseConifgV2 {
   url: string
-  header?: AnyObject
   timeout?: number
   proxy?: string
 }
@@ -364,7 +370,7 @@ interface StartPollingConfig {
 interface FetchCommon<T> {
   id: number
   statusCode: number | undefined
-  headers: IncomingHttpHeaders // node: http type
+  headers: IncomingHttpHeaders // nodejs: http type
   data: T
 }
 ```
@@ -392,8 +398,7 @@ interface FileInfo {
 interface FetchHTML {
   httpResponse: HTTPResponse | null // The type of HTTPResponse in the puppeteer library
   data: {
-    page: Page
-    content: string
+    page: Page // The type of Page in the puppeteer library
     jsdom: JSDOM // The type of JSDOM in the jsdom library
   }
 }
diff --git a/docs/cn.md b/docs/cn.md
@@ -6,13 +6,19 @@ x-crawl 是 Nodejs 多功能爬虫库。
 
 ## 特征
 
-- 只需简单的配置即可抓取 HTML 、JSON、文件资源等等
-- 使用 puppeteer 爬取 HTML ，并用 JSDOM 库对 HTML 解析，也可自行解析 HTML
-- 支持 异步/同步 方式爬取数据
-- 支持 Promise/Callback 方式获取结果
-- 轮询功能
-- 拟人化的请求间隔时间
-- 使用 TypeScript 编写，提供泛型
+- 只需简单的配置即可抓取 HTML 、JSON、文件资源等等。
+- 内置 puppeteer 爬取 HTML ，并用 JSDOM 库对 HTML 解析。
+- 支持 异步/同步 方式爬取数据。
+- 支持 Promise/Callback 方式获取结果。
+- 轮询功能。
+- 拟人化的请求间隔时间。
+- 使用 TypeScript 编写，提供泛型。
+
+## 使用 puppeteer 提供的好处
+
+- 生成页面的屏幕截图和 PDF。
+- 抓取 SPA（单页应用程序）并生成预渲染内容（即“SSR”（服务器端渲染））。
+- 自动化表单提交、UI 测试、键盘输入等。
 
 # 目录
 
@@ -41,14 +47,15 @@ x-crawl 是 Nodejs 多功能爬虫库。
     * [Method](#Method)
     * [RequestConfig](#RequestConfig)
     * [IntervalTime](#IntervalTime)
-    * [FetchBaseConifg](#FetchBaseConifg)
     * [XCrawlBaseConifg](#XCrawlBaseConifg)
+    * [FetchBaseConifgV1](#FetchBaseConifgV1)
+    * [FetchBaseConifgV2](#FetchBaseConifgV2)
     * [FetchHTMLConfig](#FetchHTMLConfig	)
     * [FetchDataConfig](#FetchDataConfig) 
     * [FetchFileConfig](#FetchFileConfig)
-    * [FetchPollingConfig](#FetchPollingConfig)
-    * [FetchCommon](#FetchCommon)
-    * [FetchCommonArr](#FetchCommonArr)
+    * [StartPollingConfig](#StartPollingConfig)
+    * [FetchResCommonV1](#FetchResCommonV1)
+    * [FetchResCommonArrV1](#FetchResCommonArrV1)
     * [FileInfo](#FileInfo)
     * [FetchHTML](#FetchHTML)
 - [更多](#更多)
@@ -63,7 +70,7 @@ npm install x-crawl
 
 ## 示例
 
-每隔一天就获取 bilibili 国漫主页的推荐轮播图片为例: 
+每隔一天就获取 bilibili 国漫主页的轮播图片为例: 
 
 ```js
 // 1.导入模块 ES/CJS
@@ -76,14 +83,14 @@ const myXCrawl = xCrawl({
 })
 
 // 3.设置爬取任务
-// 调用 fetchPolling API 开始轮询功能，每隔一天会调用回调函数
-myXCrawl.fetchPolling({ d: 1 }, () => {
+// 调用 startPolling API 开始轮询功能，每隔一天会调用回调函数
+myXCrawl.startPolling({ d: 1 }, () => {
   // 调用 fetchHTML API 爬取 HTML
   myXCrawl.fetchHTML('https://www.bilibili.com/guochuang/').then((res) => {
     const { jsdom } = res.data // 默认使用了 JSDOM 库解析 HTML
 
     // 获取轮播图片元素
-    const imgEls = jsdom.window.document.querySelectorAll('.chief-recom-item img')
+    const imgEls = jsdom.window.document.querySelectorAll('.carousel-wrapper .chief-recom-item img')
 
     // 设置请求配置
     const requestConifg = []
@@ -342,7 +349,6 @@ interface FetchBaseConifgV1 {
 ```ts
 interface FetchBaseConifgV2 {
   url: string
-  header?: AnyObject
   timeout?: number
   proxy?: string
 }
@@ -388,7 +394,7 @@ interface StartPollingConfig {
 interface FetchCommon<T> {
   id: number
   statusCode: number | undefined
-  headers: IncomingHttpHeaders // node: http 类型
+  headers: IncomingHttpHeaders // nodejs: http 类型
   data: T
 }
 ```
@@ -416,8 +422,7 @@ interface FileInfo {
 interface FetchHTML {
   httpResponse: HTTPResponse | null // puppeteer 库的 HTTPResponse 类型
   data: {
-    page: Page
-    content: string
+    page: Page // puppeteer 库的 Page 类型
     jsdom: JSDOM
   }
 }
diff --git a/package.json b/package.json
@@ -1,7 +1,7 @@
 {
   "private": true,
   "name": "x-crawl",
-  "version": "2.0.0",
+  "version": "2.1.0",
   "author": "coderHXL",
   "description": "XCrawl is a Nodejs multifunctional crawler library.",
   "license": "MIT",
diff --git a/publish/README.md b/publish/README.md
@@ -6,13 +6,19 @@ x-crawl is a Nodejs multifunctional crawler library.
 
 ## Feature
 
-- Crawl HTML, JSON, file resources, etc. with simple configuration
-- Use puppeteer to crawl HTML, and use JSDOM library to parse HTML, or parse HTML by yourself
-- Support asynchronous/synchronous way to crawl data
-- Support Promise/Callback way to get the result
-- Polling function
-- Anthropomorphic request interval
-- Written in TypeScript, provides generics
+- Crawl HTML, JSON, file resources, etc. with simple configuration.
+- Built-in puppeteer crawls HTML and uses JSDOM library to parse HTML.
+- Support asynchronous/synchronous way to crawl data.
+- Support Promise/Callback way to get the result.
+- Polling function.
+- Anthropomorphic request interval.
+- Written in TypeScript, provides generics.
+
+## Benefits provided by using puppeteer
+
+- Generate screenshots and PDFs of pages.
+- Crawl a SPA (Single-Page Application) and generate pre-rendered content (i.e. "SSR" (Server-Side Rendering)).
+- Automate form submission, UI testing, keyboard input, etc.
 
 # Table of Contents
 
@@ -41,14 +47,15 @@ x-crawl is a Nodejs multifunctional crawler library.
     * [Method](#Method)
     * [RequestConfig](#RequestConfig)
     * [IntervalTime](#IntervalTime)
-    * [FetchBaseConifg](#FetchBaseConifg)
     * [XCrawlBaseConifg](#XCrawlBaseConifg)
+    * [FetchBaseConifgV1](#FetchBaseConifgV1)
+    * [FetchBaseConifgV2](#FetchBaseConifgV2)
     * [FetchHTMLConfig](#FetchHTMLConfig	)
-    * [FetchDataConfig](#FetchDataConfig)   
+    * [FetchDataConfig](#FetchDataConfig) 
     * [FetchFileConfig](#FetchFileConfig)
     * [StartPollingConfig](#StartPollingConfig)
-    * [FetchCommon](#FetchCommon)
-    * [FetchCommonArr](#FetchCommonArr)
+    * [FetchResCommonV1](#FetchResCommonV1)
+    * [FetchResCommonArrV1](#FetchResCommonArrV1)
     * [FileInfo](#FileInfo)
     * [FetchHTML](#FetchHTML)
 - [More](#More)
@@ -318,7 +325,6 @@ interface FetchBaseConifgV1 {
 ```ts
 interface FetchBaseConifgV2 {
   url: string
-  header?: AnyObject
   timeout?: number
   proxy?: string
 }
@@ -364,7 +370,7 @@ interface StartPollingConfig {
 interface FetchCommon<T> {
   id: number
   statusCode: number | undefined
-  headers: IncomingHttpHeaders // node: http type
+  headers: IncomingHttpHeaders // nodejs: http type
   data: T
 }
 ```
@@ -392,8 +398,7 @@ interface FileInfo {
 interface FetchHTML {
   httpResponse: HTTPResponse | null // The type of HTTPResponse in the puppeteer library
   data: {
-    page: Page
-    content: string
+    page: Page // The type of Page in the puppeteer library
     jsdom: JSDOM // The type of JSDOM in the jsdom library
   }
 }
diff --git a/publish/package.json b/publish/package.json
@@ -1,6 +1,6 @@
 {
   "name": "x-crawl",
-  "version": "2.0.0",
+  "version": "2.1.0",
   "author": "coderHXL",
   "description": "XCrawl is a Nodejs multifunctional crawler library.",
   "license": "MIT",
diff --git a/test/start/index.js b/test/start/index.js
diff --git a/test/start/index.ts b/test/start/index.ts

Original file line number	Diff line number	Diff line change
`@@ -1,7 +1,7 @@`
`1`	`1`	`{`
`2`	`2`	`"private": true,`
`3`	`3`	`"name": "x-crawl",`
`4`		`- "version": "2.0.0",`
	`4`	`+ "version": "2.1.0",`
`5`	`5`	`"author": "coderHXL",`
`6`	`6`	`"description": "XCrawl is a Nodejs multifunctional crawler library.",`
`7`	`7`	`"license": "MIT",`
Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`	`{`
`2`	`2`	`"name": "x-crawl",`
`3`		`- "version": "2.0.0",`
	`3`	`+ "version": "2.1.0",`
`4`	`4`	`"author": "coderHXL",`
`5`	`5`	`"description": "XCrawl is a Nodejs multifunctional crawler library.",`
`6`	`6`	`"license": "MIT",`