Skip to content

Commit 05371d9

Browse files
committed
Type adjustments - startPolling drops the year and month options - fetchHTML creates the browser only once
1 parent 72a4071 commit 05371d9

File tree

14 files changed

+624
-652
lines changed

14 files changed

+624
-652
lines changed

README.md

+106-119
Large diffs are not rendered by default.

docs/cn.md

+103-116
Large diffs are not rendered by default.

publish/README.md

+107-120
Large diffs are not rendered by default.

publish/package.json

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "x-crawl",
3-
"version": "1.1.1",
3+
"version": "2.0.0",
44
"author": "coderHXL",
55
"description": "XCrawl is a Nodejs multifunctional crawler library.",
66
"license": "MIT",
@@ -28,6 +28,7 @@
2828
"dependencies": {
2929
"chalk": "4.1.2",
3030
"https-proxy-agent": "^5.0.1",
31+
"puppeteer": "^19.7.2",
3132
"jsdom": "^21.1.0",
3233
"@types/jsdom": "^20.0.1"
3334
}

src/api.ts

+74-68
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ import fs from 'node:fs'
22
import { writeFile } from 'node:fs/promises'
33
import path from 'node:path'
44
import { JSDOM } from 'jsdom'
5-
import puppeteer, { Browser, Page } from 'puppeteer'
5+
import puppeteer, { Browser } from 'puppeteer'
66

77
import { batchRequest, syncBatchRequest } from './request'
88
import { quickSort } from './sort'
@@ -18,23 +18,22 @@ import {
1818
} from './utils'
1919

2020
import {
21-
IXCrawlBaseConifg,
22-
IFetchHTMLConfig,
23-
IFetchDataConfig,
24-
IFetchFileConfig,
25-
IStartPollingConfig,
26-
IFetchBaseConifg,
27-
IFileInfo,
28-
IFetchHTML,
29-
IRequestResItem,
30-
IRequestConfig,
31-
IIntervalTime,
32-
IFetchCommon,
33-
IFetchCommonArr
34-
} from './types'
35-
36-
function mergeConfig<T extends IFetchBaseConifg>(
37-
baseConfig: IXCrawlBaseConifg,
21+
FetchBaseConifgV1,
22+
FetchDataConfig,
23+
FetchFileConfig,
24+
FetchHTML,
25+
FetchHTMLConfig,
26+
FetchResCommonArrV1,
27+
FetchResCommonV1,
28+
FileInfo,
29+
IntervalTime,
30+
StartPollingConfig
31+
} from './types/api'
32+
import { XCrawlBaseConifg } from './types'
33+
import { RequestConfig, RequestResItem } from './types/request'
34+
35+
function mergeConfig<T extends FetchBaseConifgV1>(
36+
baseConfig: XCrawlBaseConifg,
3837
rawConfig: T
3938
): T {
4039
const newConfig = structuredClone(rawConfig)
@@ -43,22 +42,22 @@ function mergeConfig<T extends IFetchBaseConifg>(
4342
const requestConifgArr = isArray(newConfig.requestConifg)
4443
? newConfig.requestConifg
4544
: [newConfig.requestConifg]
46-
for (const requestItem of requestConifgArr) {
47-
const { url, timeout, proxy } = requestItem
45+
for (const requesttem of requestConifgArr) {
46+
const { url, timeout, proxy } = requesttem
4847

4948
// 1.1.baseUrl
5049
if (!isUndefined(baseConfig.baseUrl)) {
51-
requestItem.url = baseConfig.baseUrl + url
50+
requesttem.url = baseConfig.baseUrl + url
5251
}
5352

5453
// 1.2.timeout
5554
if (isUndefined(timeout)) {
56-
requestItem.timeout = baseConfig.timeout
55+
requesttem.timeout = baseConfig.timeout
5756
}
5857

5958
// 1.3.proxy
6059
if (isUndefined(proxy)) {
61-
requestItem.proxy = baseConfig.proxy
60+
requesttem.proxy = baseConfig.proxy
6261
}
6362
}
6463

@@ -72,9 +71,9 @@ function mergeConfig<T extends IFetchBaseConifg>(
7271

7372
async function useBatchRequestByMode(
7473
mode: 'async' | 'sync' | undefined,
75-
requestConifg: IRequestConfig | IRequestConfig[],
76-
intervalTime: IIntervalTime | undefined,
77-
callback: (requestResItem: IRequestResItem) => void
74+
requestConifg: RequestConfig | RequestConfig[],
75+
intervalTime: IntervalTime | undefined,
76+
callback: (requestRestem: RequestResItem) => void
7877
) {
7978
const requestConfigQueue = isArray(requestConifg)
8079
? requestConifg
@@ -87,25 +86,33 @@ async function useBatchRequestByMode(
8786
}
8887
}
8988

90-
export function createFetchHTML(baseConfig: IXCrawlBaseConifg) {
91-
// 初始值
89+
export function createFetchHTML(baseConfig: XCrawlBaseConifg) {
9290
let browser: Browser | null = null
93-
let page: Page | null = null
94-
let useTotal = 0
91+
let createBrowserState: Promise<void> | null = null
92+
let callTotal = 0
9593

9694
async function fetchHTML(
97-
config: IFetchHTMLConfig,
98-
callback?: (res: IFetchHTML) => void
99-
): Promise<IFetchHTML> {
100-
// 完成初始化
101-
if (useTotal === 0) {
102-
browser = await puppeteer.launch()
103-
page = await browser.newPage()
104-
await page.setViewport({ width: 1280, height: 1024 })
95+
config: FetchHTMLConfig,
96+
callback?: (res: FetchHTML) => void
97+
): Promise<FetchHTML> {
98+
// 记录调用次数, 为关闭浏览器
99+
callTotal++
100+
101+
// 只创建一次浏览器
102+
if (callTotal === 1) {
103+
createBrowserState = puppeteer.launch().then((res) => {
104+
browser = res
105+
})
106+
}
107+
108+
// 等待浏览器创建完毕
109+
if (createBrowserState) {
110+
await Promise.all([createBrowserState])
111+
createBrowserState = null
105112
}
106113

107-
// 记录调用次数
108-
useTotal++
114+
const page = await browser!.newPage()
115+
await page.setViewport({ width: 1280, height: 1024 })
109116

110117
const { requestConifg } = mergeConfig(baseConfig, {
111118
requestConifg: isString(config) ? { url: config } : config
@@ -127,13 +134,14 @@ export function createFetchHTML(baseConfig: IXCrawlBaseConifg) {
127134
const content = await page!.content()
128135

129136
// 关闭浏览器
130-
if (--useTotal === 0) {
137+
if (--callTotal === 0) {
131138
await browser!.close()
132139
}
133140

134-
const res: IFetchHTML = {
141+
const res: FetchHTML = {
135142
httpResponse,
136143
data: {
144+
page,
137145
content,
138146
jsdom: new JSDOM(content)
139147
}
@@ -149,23 +157,23 @@ export function createFetchHTML(baseConfig: IXCrawlBaseConifg) {
149157
return fetchHTML
150158
}
151159

152-
export function createFetchData(baseConfig: IXCrawlBaseConifg) {
160+
export function createFetchData(baseConfig: XCrawlBaseConifg) {
153161
async function fetchData<T = any>(
154-
config: IFetchDataConfig,
155-
callback?: (res: IFetchCommon<T>) => void
156-
): Promise<IFetchCommonArr<T>> {
162+
config: FetchDataConfig,
163+
callback?: (res: FetchResCommonV1<T>) => void
164+
): Promise<FetchResCommonArrV1<T>> {
157165
const { requestConifg, intervalTime } = mergeConfig(baseConfig, config)
158166

159-
const container: IFetchCommonArr<T> = []
160-
function handleResItem(requestResItem: IRequestResItem) {
161-
const contentType = requestResItem.headers['content-type'] ?? ''
162-
const rawData = requestResItem.data
167+
const container: FetchResCommonArrV1<T> = []
168+
function handleRestem(requestRestem: RequestResItem) {
169+
const contentType = requestRestem.headers['content-type'] ?? ''
170+
const rawData = requestRestem.data
163171

164172
const data = contentType.includes('text')
165173
? rawData.toString()
166174
: JSON.parse(rawData.toString())
167175

168-
const itemRes = { ...requestResItem, data }
176+
const itemRes = { ...requestRestem, data }
169177

170178
if (callback) {
171179
callback(itemRes)
@@ -178,7 +186,7 @@ export function createFetchData(baseConfig: IXCrawlBaseConifg) {
178186
baseConfig.mode,
179187
requestConifg,
180188
intervalTime,
181-
handleResItem
189+
handleRestem
182190
)
183191

184192
const res = quickSort(
@@ -190,26 +198,26 @@ export function createFetchData(baseConfig: IXCrawlBaseConifg) {
190198
return fetchData
191199
}
192200

193-
export function createFetchFile(baseConfig: IXCrawlBaseConifg) {
201+
export function createFetchFile(baseConfig: XCrawlBaseConifg) {
194202
async function fetchFile(
195-
config: IFetchFileConfig,
196-
callback?: (res: IFetchCommon<IFileInfo>) => void
197-
): Promise<IFetchCommonArr<IFileInfo>> {
203+
config: FetchFileConfig,
204+
callback?: (res: FetchResCommonV1<FileInfo>) => void
205+
): Promise<FetchResCommonArrV1<FileInfo>> {
198206
const { requestConifg, intervalTime, fileConfig } = mergeConfig(
199207
baseConfig,
200208
config
201209
)
202210

203-
const container: IFetchCommonArr<IFileInfo> = []
211+
const container: FetchResCommonArrV1<FileInfo> = []
204212
const saveFileArr: Promise<void>[] = []
205213
const saveFileErrorArr: { message: string; valueOf: () => number }[] = []
206214

207215
if (!fs.existsSync(fileConfig.storeDir)) {
208216
fs.mkdirSync(fileConfig.storeDir)
209217
}
210218

211-
function handleResItem(requestResItem: IRequestResItem) {
212-
const { id, headers, data } = requestResItem
219+
function handleRestem(requestRestem: RequestResItem) {
220+
const { id, headers, data } = requestRestem
213221

214222
const mimeType = headers['content-type'] ?? ''
215223
const fileExtension = fileConfig.extension ?? mimeType.split('/').pop()
@@ -219,7 +227,7 @@ export function createFetchFile(baseConfig: IXCrawlBaseConifg) {
219227
`${fileName}.${fileExtension}`
220228
)
221229

222-
const saveFileItem = writeFile(filePath, data)
230+
const saveFiletem = writeFile(filePath, data)
223231
.catch((err) => {
224232
const message = `File save error at id ${id}: ${err.message}`
225233
const valueOf = () => id
@@ -232,7 +240,7 @@ export function createFetchFile(baseConfig: IXCrawlBaseConifg) {
232240
if (isError) return
233241

234242
const res = {
235-
...requestResItem,
243+
...requestRestem,
236244
data: { fileName, mimeType, size: data.length, filePath }
237245
}
238246

@@ -243,14 +251,14 @@ export function createFetchFile(baseConfig: IXCrawlBaseConifg) {
243251
container.push(res)
244252
})
245253

246-
saveFileArr.push(saveFileItem)
254+
saveFileArr.push(saveFiletem)
247255
}
248256

249257
await useBatchRequestByMode(
250258
baseConfig.mode,
251259
requestConifg,
252260
intervalTime,
253-
handleResItem
261+
handleRestem
254262
)
255263

256264
// 等待保存文件任务完成
@@ -280,17 +288,15 @@ export function createFetchFile(baseConfig: IXCrawlBaseConifg) {
280288
}
281289

282290
export function startPolling(
283-
config: IStartPollingConfig,
291+
config: StartPollingConfig,
284292
callback: (count: number) => void
285293
) {
286-
const { Y, M, d, h, m } = config
294+
const { d, h, m } = config
287295

288-
const year = !isUndefined(Y) ? Y * 1000 * 60 * 60 * 24 * 365 : 0
289-
const month = !isUndefined(M) ? M * 1000 * 60 * 60 * 24 * 30 : 0
290296
const day = !isUndefined(d) ? d * 1000 * 60 * 60 * 24 : 0
291297
const hour = !isUndefined(h) ? h * 1000 * 60 * 60 : 0
292298
const minute = !isUndefined(m) ? m * 1000 * 60 : 0
293-
const total = year + month + day + hour + minute
299+
const total = day + hour + minute
294300

295301
let count = 0
296302
function startCallback() {

src/index.ts

+6-6
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,10 @@ import {
55
startPolling
66
} from './api'
77

8-
import { IXCrawlInstance, IXCrawlBaseConifg } from './types'
8+
import { XCrawlBaseConifg, XCrawlInstance } from './types'
99

10-
function createInstance(baseConfig: IXCrawlBaseConifg): IXCrawlInstance {
11-
const instance: IXCrawlInstance = {
10+
function createnstance(baseConfig: XCrawlBaseConifg): XCrawlInstance {
11+
const instance: XCrawlInstance = {
1212
fetchHTML: createFetchHTML(baseConfig),
1313
fetchData: createFetchData(baseConfig),
1414
fetchFile: createFetchFile(baseConfig),
@@ -19,9 +19,9 @@ function createInstance(baseConfig: IXCrawlBaseConifg): IXCrawlInstance {
1919
}
2020

2121
export default function xCrawl(
22-
baseConfig: IXCrawlBaseConifg = {}
23-
): IXCrawlInstance {
24-
const instance = createInstance(baseConfig)
22+
baseConfig: XCrawlBaseConifg = {}
23+
): XCrawlInstance {
24+
const instance = createnstance(baseConfig)
2525

2626
return instance
2727
}

0 commit comments

Comments
 (0)