Skip to content

Commit 408e85a

Browse files
committed
other
1 parent def8748 commit 408e85a

File tree

8 files changed

+134
-87
lines changed

8 files changed

+134
-87
lines changed

README.md

+29-21
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,20 @@
22

33
English | [简体中文](https://github.com/coder-hxl/x-crawl/blob/main/document/cn.md)
44

5-
XCrawl is a Nodejs multifunctional crawler library. Crawl HTML, JSON, file resources, etc. through simple configuration.
5+
XCrawl is a multifunctional crawler library for Node.js.
66

7-
## highlights
7+
## Features
88

9-
- Simple configuration to grab HTML, JSON, file resources, etc
10-
- Batch requests can choose mode asynchronous or synchronous
11-
- polling function
9+
- Crawl HTML, JSON, file resources, etc. with simple configuration
10+
- Parse HTML with the JSDOM library, or parse it yourself
11+
- Batch requests can run in asynchronous or synchronous mode
12+
- Polling function
1213
- Human-like request intervals
14+
- Written in TypeScript
15+
16+
## Catalog
17+
18+
[TOC]
1319

1420
## Install
1521

@@ -174,7 +180,7 @@ myXCrawl.fetchFile({
174180
175181
fetchPolling is a method of the [myXCrawl](https://github.com/coder-hxl/x-crawl#Example-1) instance, typically used to perform polling operations, such as getting news every once in a while.
176182
177-
#### 类型
183+
#### Type
178184
179185
```ts
180186
function fetchPolling(
@@ -183,7 +189,7 @@ function fetchPolling(
183189
): void
184190
```
185191

186-
#### 示例
192+
#### Example
187193

188194
```js
189195
myXCrawl.fetchPolling({ h: 1, m: 30 }, () => {
@@ -194,21 +200,21 @@ myXCrawl.fetchPolling({ h: 1, m: 30 }, () => {
194200
195201
## Types
196202
197-
#### IAnyObject
203+
### IAnyObject
198204
199205
```ts
200206
interface IAnyObject extends Object {
201207
[key: string | number | symbol]: any
202208
}
203209
```
204210
205-
#### IMethod
211+
### IMethod
206212
207213
```ts
208214
type IMethod = 'get' | 'GET' | 'delete' | 'DELETE' | 'head' | 'HEAD' | 'options' | 'OPTIONS' | 'post' | 'POST' | 'put' | 'PUT' | 'patch' | 'PATCH' | 'purge' | 'PURGE' | 'link' | 'LINK' | 'unlink' | 'UNLINK'
209215
```
210216
211-
#### IRequestConfig
217+
### IRequestConfig
212218
213219
```ts
214220
interface IRequestConfig {
@@ -218,10 +224,11 @@ interface IRequestConfig {
218224
params?: IAnyObject
219225
data?: any
220226
timeout?: number
227+
proxy?: string
221228
}
222229
```
223230
224-
#### IIntervalTime
231+
### IIntervalTime
225232
226233
```ts
227234
type IIntervalTime = number | {
@@ -230,7 +237,7 @@ type IIntervalTime = number | {
230237
}
231238
```
232239
233-
#### IFetchBaseConifg
240+
### IFetchBaseConifg
234241
235242
```ts
236243
interface IFetchBaseConifg {
@@ -239,31 +246,32 @@ interface IFetchBaseConifg {
239246
}
240247
```
241248
242-
#### IXCrawlBaseConifg
249+
### IXCrawlBaseConifg
243250
244251
```ts
245252
interface IXCrawlBaseConifg {
246253
baseUrl?: string
247254
timeout?: number
248255
intervalTime?: IIntervalTime
249256
mode?: 'async' | 'sync'
257+
proxy?: string
250258
}
251259
```
252260
253-
#### IFetchHTMLConfig
261+
### IFetchHTMLConfig
254262
255263
```ts
256264
type IFetchHTMLConfig = string | IRequestConfig
257265
```
258266
259-
#### IFetchDataConfig
267+
### IFetchDataConfig
260268
261269
```ts
262270
interface IFetchDataConfig extends IFetchBaseConifg {
263271
}
264272
```
265273
266-
#### IFetchFileConfig
274+
### IFetchFileConfig
267275
268276
```ts
269277
interface IFetchFileConfig extends IFetchBaseConifg {
@@ -273,7 +281,7 @@ interface IFetchFileConfig extends IFetchBaseConifg {
273281
}
274282
```
275283
276-
#### IFetchPollingConfig
284+
### IFetchPollingConfig
277285
278286
```ts
279287
interface IFetchPollingConfig {
@@ -285,7 +293,7 @@ interface IFetchPollingConfig {
285293
}
286294
```
287295
288-
#### IFetchCommon
296+
### IFetchCommon
289297
290298
```ts
291299
type IFetchCommon<T> = {
@@ -296,7 +304,7 @@ type IFetchCommon<T> = {
296304
}[]
297305
```
298306
299-
#### IFileInfo
307+
### IFileInfo
300308
301309
```ts
302310
interface IFileInfo {
@@ -307,14 +315,14 @@ interface IFileInfo {
307315
}
308316
```
309317
310-
#### IFetchHTML
318+
### IFetchHTML
311319
312320
```ts
313321
interface IFetchHTML {
314322
statusCode: number | undefined
315323
headers: IncomingHttpHeaders
316324
data: {
317-
raw: string // HTML String
325+
html: string // HTML String
318326
jsdom: JSDOM // HTML parsing using the jsdom library
319327
}
320328
}

document/cn.md

+26-18
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,20 @@
22

33
[English](https://github.com/coder-hxl/x-crawl#x-crawl) | 简体中文
44

5-
XCrawl 是 Nodejs 多功能爬虫库。只需简单的配置即可抓取 HTML 、JSON、文件资源等等。
5+
XCrawl 是 Nodejs 多功能爬虫库。
66

7-
## 亮点
7+
## 特点
88

9-
- 简单的配置即可抓取 HTML 、JSON 、文件资源等等
10-
- 批量请求可选择模式 异步 或 同步
9+
- 只需简单的配置即可抓取 HTML 、JSON、文件资源等等
10+
- 使用 JSDOM 库对 HTML 解析,也可自行解析 HTML
11+
- 批量请求时可选择模式 异步/同步
1112
- 轮询功能
1213
- 拟人化的请求间隔时间
14+
- 使用 TypeScript 编写
15+
16+
## 目录
17+
18+
[TOC]
1319

1420
## 安装
1521

@@ -206,21 +212,21 @@ myXCrawl.fetchPolling({ h: 1, m: 30 }, () => {
206212
207213
## 类型
208214
209-
#### IAnyObject
215+
### IAnyObject
210216
211217
```ts
212218
interface IAnyObject extends Object {
213219
[key: string | number | symbol]: any
214220
}
215221
```
216222
217-
#### IMethod
223+
### IMethod
218224
219225
```ts
220226
type IMethod = 'get' | 'GET' | 'delete' | 'DELETE' | 'head' | 'HEAD' | 'options' | 'OPTIONS' | 'post' | 'POST' | 'put' | 'PUT' | 'patch' | 'PATCH' | 'purge' | 'PURGE' | 'link' | 'LINK' | 'unlink' | 'UNLINK'
221227
```
222228
223-
#### IRequestConfig
229+
### IRequestConfig
224230
225231
```ts
226232
interface IRequestConfig {
@@ -230,10 +236,11 @@ interface IRequestConfig {
230236
params?: IAnyObject
231237
data?: any
232238
timeout?: number
239+
proxy?: string
233240
}
234241
```
235242
236-
#### IIntervalTime
243+
### IIntervalTime
237244
238245
```ts
239246
type IIntervalTime = number | {
@@ -242,7 +249,7 @@ type IIntervalTime = number | {
242249
}
243250
```
244251
245-
#### IFetchBaseConifg
252+
### IFetchBaseConifg
246253
247254
```ts
248255
interface IFetchBaseConifg {
@@ -251,31 +258,32 @@ interface IFetchBaseConifg {
251258
}
252259
```
253260
254-
#### IXCrawlBaseConifg
261+
### IXCrawlBaseConifg
255262
256263
```ts
257264
interface IXCrawlBaseConifg {
258265
baseUrl?: string
259266
timeout?: number
260267
intervalTime?: IIntervalTime
261268
mode?: 'async' | 'sync'
269+
proxy?: string
262270
}
263271
```
264272
265-
#### IFetchHTMLConfig
273+
### IFetchHTMLConfig
266274
267275
```ts
268276
type IFetchHTMLConfig = string | IRequestConfig
269277
```
270278
271-
#### IFetchDataConfig
279+
### IFetchDataConfig
272280
273281
```ts
274282
interface IFetchDataConfig extends IFetchBaseConifg {
275283
}
276284
```
277285
278-
#### IFetchFileConfig
286+
### IFetchFileConfig
279287
280288
```ts
281289
interface IFetchFileConfig extends IFetchBaseConifg {
@@ -285,7 +293,7 @@ interface IFetchFileConfig extends IFetchBaseConifg {
285293
}
286294
```
287295
288-
#### IFetchPollingConfig
296+
### IFetchPollingConfig
289297
290298
```ts
291299
interface IFetchPollingConfig {
@@ -297,7 +305,7 @@ interface IFetchPollingConfig {
297305
}
298306
```
299307
300-
#### IFetchCommon
308+
### IFetchCommon
301309
302310
```ts
303311
type IFetchCommon<T> = {
@@ -308,7 +316,7 @@ type IFetchCommon<T> = {
308316
}[]
309317
```
310318
311-
#### IFileInfo
319+
### IFileInfo
312320
313321
```ts
314322
interface IFileInfo {
@@ -319,14 +327,14 @@ interface IFileInfo {
319327
}
320328
```
321329
322-
#### IFetchHTML
330+
### IFetchHTML
323331
324332
```ts
325333
interface IFetchHTML {
326334
statusCode: number | undefined
327335
headers: IncomingHttpHeaders
328336
data: {
329-
raw: string // HTML String
337+
html: string // HTML String
330338
jsdom: JSDOM // 使用了 jsdom 库对 HTML 解析
331339
}
332340
}

package.json

+3-2
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
{
22
"private": true,
33
"name": "x-crawl",
4-
"version": "0.2.0",
4+
"version": "0.3.0",
55
"author": "CoderHxl",
6-
"description": "XCrawl is a Nodejs multifunctional crawler library. Crawl HTML, JSON, file resources, etc. through simple configuration.",
6+
"description": "XCrawl is a Nodejs multifunctional crawler library.",
77
"license": "MIT",
88
"main": "src/index.ts",
99
"scripts": {
@@ -16,6 +16,7 @@
1616
},
1717
"dependencies": {
1818
"chalk": "4.1.2",
19+
"https-proxy-agent": "^5.0.1",
1920
"jsdom": "^21.1.0",
2021
"x-crawl": "link:"
2122
},

pnpm-lock.yaml

+2
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)