Skip to content

Commit def8748

Browse files
committed
Add proxy option / Rename the `raw` property of the fetchHTML return value's data object to `html`
1 parent d46b0c0 commit def8748

File tree

3 files changed

+57
-39
lines changed

3 files changed

+57
-39
lines changed

src/index.ts

+12-5
Original file line numberDiff line numberDiff line change
@@ -38,22 +38,29 @@ export default class XCrawl {
3838

3939
private mergeConfig<T extends IFetchBaseConifg>(rawConfig: T): T {
4040
const baseConfig = this.baseConfig
41-
const newConfig: T = structuredClone(rawConfig)
41+
const newConfig = structuredClone(rawConfig)
4242

4343
// 1.处理 requestConifg
4444
const requestConifgArr = isArray(newConfig.requestConifg)
4545
? newConfig.requestConifg
4646
: [newConfig.requestConifg]
4747
for (const requestItem of requestConifgArr) {
48-
const { url, timeout } = requestItem
48+
const { url, timeout, proxy } = requestItem
4949

50+
// 1.1.baseUrl
5051
if (!isUndefined(baseConfig.baseUrl)) {
5152
requestItem.url = baseConfig.baseUrl + url
5253
}
5354

55+
// 1.2.timeout
5456
if (isUndefined(timeout)) {
5557
requestItem.timeout = baseConfig.timeout
5658
}
59+
60+
// 1.3.proxy
61+
if (isUndefined(proxy)) {
62+
requestItem.proxy = baseConfig.proxy
63+
}
5764
}
5865

5966
// 2.处理 intervalTime
@@ -88,13 +95,13 @@ export default class XCrawl {
8895
})
8996

9097
const requestRes = await request(requestConifg)
91-
const rawData = requestRes.data.toString()
98+
const html = requestRes.data.toString()
9299

93100
const res: IFetchHTML = {
94101
...requestRes,
95102
data: {
96-
raw: rawData,
97-
jsdom: new JSDOM(rawData)
103+
html,
104+
jsdom: new JSDOM(html)
98105
}
99106
}
100107

src/request.ts

+42-33
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
1-
import http, { Agent, RequestOptions } from 'node:http'
2-
import { Agent as httpsAgent } from 'https'
1+
import http, { RequestOptions, IncomingMessage, ClientRequest } from 'node:http'
2+
import https from 'node:https'
33
import Url, { URL } from 'node:url'
4+
import HttpsProxyAgent from 'https-proxy-agent'
45

56
import {
67
isNumber,
@@ -62,8 +63,15 @@ function handleRequestConfig(
6263
const { protocol, hostname, port, pathname, search } = new Url.URL(
6364
rawConfig.url
6465
)
66+
const isHttp = protocol === 'http:'
6567

6668
const config: RequestOptions & IMapTypeEmptyObject<URL> = {
69+
agent: rawConfig.proxy
70+
? HttpsProxyAgent(rawConfig.proxy)
71+
: isHttp
72+
? new http.Agent()
73+
: new https.Agent(),
74+
6775
protocol,
6876
hostname,
6977
port,
@@ -77,46 +85,17 @@ function handleRequestConfig(
7785

7886
config.headers = parseHeaders(rawConfig, config)
7987

80-
if (protocol === 'http:') {
81-
config.agent = new Agent()
82-
} else {
83-
config.agent = new httpsAgent()
84-
}
85-
8688
return config
8789
}
8890

89-
async function useSleepByBatch(
90-
isHaveIntervalTime: boolean,
91-
isNumberIntervalTime: boolean,
92-
intervalTime: any,
93-
id: number
94-
) {
95-
if (isHaveIntervalTime && id > 1) {
96-
const timeout: number = isNumberIntervalTime
97-
? intervalTime
98-
: random(intervalTime.max, intervalTime.min)
99-
100-
log(
101-
`Request ${logNumber(id)} needs to sleep for ${logNumber(
102-
timeout + 'ms'
103-
)} milliseconds before sending`
104-
)
105-
106-
await sleep(timeout)
107-
} else {
108-
log(`Request ${logNumber(id)} does not need to sleep, send immediately`)
109-
}
110-
}
111-
11291
export function request(config: IRequestConfig) {
11392
return new Promise<IRequest>((resolve, reject) => {
11493
const isDataUndefine = isUndefined(config.data)
11594
config.data = !isDataUndefine ? JSON.stringify(config.data) : config.data
11695

11796
const requestConfig = handleRequestConfig(config)
11897

119-
const req = http.request(requestConfig, (res) => {
98+
function handleRes(res: IncomingMessage) {
12099
const { statusCode, headers } = res
121100

122101
const container: Buffer[] = []
@@ -133,7 +112,14 @@ export function request(config: IRequestConfig) {
133112

134113
resolve(resolveRes)
135114
})
136-
})
115+
}
116+
117+
let req: ClientRequest
118+
if (requestConfig.protocol === 'http:') {
119+
req = http.request(requestConfig, handleRes)
120+
} else {
121+
req = https.request(requestConfig, handleRes)
122+
}
137123

138124
req.on('timeout', () => {
139125
reject(new Error(`Timeout ${config.timeout}ms`))
@@ -152,6 +138,29 @@ export function request(config: IRequestConfig) {
152138
})
153139
}
154140

141+
async function useSleepByBatch(
142+
isHaveIntervalTime: boolean,
143+
isNumberIntervalTime: boolean,
144+
intervalTime: any,
145+
id: number
146+
) {
147+
if (isHaveIntervalTime && id > 1) {
148+
const timeout: number = isNumberIntervalTime
149+
? intervalTime
150+
: random(intervalTime.max, intervalTime.min)
151+
152+
log(
153+
`Request ${logNumber(id)} needs to sleep for ${logNumber(
154+
timeout + 'ms'
155+
)} milliseconds before sending`
156+
)
157+
158+
await sleep(timeout)
159+
} else {
160+
log(`Request ${logNumber(id)} does not need to sleep, send immediately`)
161+
}
162+
}
163+
155164
export async function batchRequest(
156165
requestConifgs: IRequestConfig[],
157166
intervalTime: IIntervalTime | undefined

src/types.ts

+3-1
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ export interface IRequestConfig {
5252
params?: IAnyObject
5353
data?: any
5454
timeout?: number
55+
proxy?: string
5556
}
5657

5758
export type IIntervalTime = number | { max: number; min?: number }
@@ -61,6 +62,7 @@ export interface IXCrawlBaseConifg {
6162
timeout?: number
6263
intervalTime?: IIntervalTime
6364
mode?: 'async' | 'sync'
65+
proxy?: string
6466
}
6567

6668
export interface IFetchBaseConifg {
@@ -104,7 +106,7 @@ export interface IFetchHTML {
104106
statusCode: number | undefined
105107
headers: IncomingHttpHeaders
106108
data: {
107-
raw: string
109+
html: string
108110
jsdom: JSDOM
109111
}
110112
}

0 commit comments

Comments
 (0)