Skip to content

Commit 67e25fa

Browse files
committed
Update: crawlPage API allows crawling pages with cookies
1 parent a063fd1 commit 67e25fa

File tree

9 files changed

+101
-36
lines changed

9 files changed

+101
-36
lines changed

README.md

+23-13
Original file line numberDiff line numberDiff line change
@@ -64,11 +64,13 @@ The crawlPage API internally uses the [puppeteer](https://github.com/puppeteer/p
6464
- [RequestConfig](#RequestConfig)
6565
- [IntervalTime](#IntervalTime)
6666
- [XCrawlBaseConfig](#XCrawlBaseConfig)
67-
- [CrawlPageConfig](#CrawlPageConfig)
6867
- [CrawlBaseConfigV1](#CrawlBaseConfigV1)
68+
- [CrawlBaseConfigV2](#CrawlBaseConfigV2)
69+
- [CrawlPageConfig](#CrawlPageConfig)
6970
- [CrawlDataConfig](#CrawlDataConfig)
7071
- [CrawlFileConfig](#CrawlFileConfig)
7172
- [StartPollingConfig](#StartPollingConfig)
73+
- [XCrawlInstance](#XCrawlInstance)
7274
- [CrawlResCommonV1](#CrawlResCommonV1)
7375
- [CrawlResCommonArrV1](#CrawlResCommonArrV1)
7476
- [CrawlPage](#CrawlPage-1)
@@ -747,34 +749,42 @@ interface XCrawlBaseConfig {
747749
}
748750
```
749751
750-
### CrawlPageConfig
752+
### CrawlBaseConfigV1
751753
752754
```ts
753-
type CrawlPageConfig = string | RequestConfigObjectV1
755+
interface CrawlBaseConfigV1 extends RequestConfigObjectV1 {
756+
cookies?: string | Protocol.Network.CookieParam | Protocol.Network.CookieParam[] // Protocol is exported by the puppeteer library
757+
}
754758
```
755759
756-
### CrawlBaseConfigV1
760+
### CrawlBaseConfigV2
757761
758762
```ts
759-
interface CrawlBaseConfigV1 {
763+
interface CrawlBaseConfigV2 {
760764
requestConfig: RequestConfig | RequestConfig[]
761765
intervalTime?: IntervalTime
762766
}
763767
```
764768
769+
### CrawlPageConfig
770+
771+
```ts
772+
type CrawlPageConfig = string | CrawlBaseConfigV1
773+
```
774+
765775
### CrawlDataConfig
766776
767777
```ts
768-
interface CrawlDataConfig extends CrawlBaseConfigV1 {}
778+
interface CrawlDataConfig extends CrawlBaseConfigV2 {}
769779
```
770780
771781
### CrawlFileConfig
772782
773783
```ts
774-
interface CrawlFileConfig extends CrawlBaseConfigV1 {
784+
interface CrawlFileConfig extends CrawlBaseConfigV2 {
775785
fileConfig: {
776786
storeDir: string // Store folder
777-
extension?: string // Filename extension
787+
extension?: string // Filename extension
778788
}
779789
}
780790
```
@@ -821,7 +831,7 @@ interface XCrawlInstance {
821831
interface CrawlResCommonV1<T> {
822832
id: number
823833
statusCode: number | undefined
824-
headers: IncomingHttpHeaders // nodejs: http type
834+
headers: IncomingHttpHeaders // IncomingHttpHeaders is from the Node.js http module
825835
data: T
826836
}
827837
```
@@ -836,10 +846,10 @@ type CrawlResCommonArrV1<T> = CrawlResCommonV1<T>[]
836846
837847
```ts
838848
interface CrawlPage {
839-
httpResponse: HTTPResponse | null // The type of HTTPResponse in the puppeteer library
840-
browser: Browser // The Browser type of the puppeteer library
841-
page: Page // The Page type of the puppeteer library
842-
jsdom: JSDOM // jsdom type of the JSDOM library
849+
httpResponse: HTTPResponse | null // The HTTPResponse is from the puppeteer library
850+
browser: Browser // The Browser is from the puppeteer library
851+
page: Page // The Page is from the puppeteer library
852+
jsdom: JSDOM // The JSDOM is from the jsdom library
843853
}
844854
```
845855

docs/cn.md

+17-7
Original file line numberDiff line numberDiff line change
@@ -64,11 +64,13 @@ crawlPage API 内部使用 [puppeteer](https://github.com/puppeteer/puppeteer)
6464
- [RequestConfig](#RequestConfig)
6565
- [IntervalTime](#IntervalTime)
6666
- [XCrawlBaseConfig](#XCrawlBaseConfig)
67-
- [CrawlPageConfig](#CrawlPageConfig)
6867
- [CrawlBaseConfigV1](#CrawlBaseConfigV1)
68+
- [CrawlBaseConfigV2](#CrawlBaseConfigV2)
69+
- [CrawlPageConfig](#CrawlPageConfig)
6970
- [CrawlDataConfig](#CrawlDataConfig)
7071
- [CrawlFileConfig](#CrawlFileConfig)
7172
- [StartPollingConfig](#StartPollingConfig)
73+
- [XCrawlInstance](#XCrawlInstance)
7274
- [CrawlResCommonV1](#CrawlResCommonV1)
7375
- [CrawlResCommonArrV1](#CrawlResCommonArrV1)
7476
- [CrawlPage](#CrawlPage-1)
@@ -740,31 +742,39 @@ interface XCrawlBaseConfig {
740742
}
741743
```
742744

743-
### CrawlPageConfig
745+
### CrawlBaseConfigV1
744746

745747
```ts
746-
type CrawlPageConfig = string | RequestConfigObjectV1
748+
interface CrawlBaseConfigV1 extends RequestConfigObjectV1 {
749+
cookies?: string | Protocol.Network.CookieParam | Protocol.Network.CookieParam[] // Protocol 来自 puppeteer 库
750+
}
747751
```
748752

749-
### CrawlBaseConfigV1
753+
### CrawlBaseConfigV2
750754

751755
```ts
752-
interface CrawlBaseConfigV1 {
756+
interface CrawlBaseConfigV2 {
753757
requestConfig: RequestConfig | RequestConfig[]
754758
intervalTime?: IntervalTime
755759
}
756760
```
757761

762+
### CrawlPageConfig
763+
764+
```ts
765+
type CrawlPageConfig = string | CrawlBaseConfigV1
766+
```
767+
758768
### CrawlDataConfig
759769

760770
```ts
761-
interface CrawlDataConfig extends CrawlBaseConfigV1 {}
771+
interface CrawlDataConfig extends CrawlBaseConfigV2 {}
762772
```
763773

764774
### CrawlFileConfig
765775

766776
```ts
767-
interface CrawlFileConfig extends CrawlBaseConfigV1 {
777+
interface CrawlFileConfig extends CrawlBaseConfigV2 {
768778
fileConfig: {
769779
storeDir: string // 存放文件夹
770780
extension?: string // 文件扩展名

package.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"private": true,
33
"name": "x-crawl",
4-
"version": "3.2.13",
4+
"version": "3.3.0",
55
"author": "coderHXL",
66
"description": "x-crawl is a flexible nodejs crawler library.",
77
"license": "MIT",

publish/package.json

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "x-crawl",
3-
"version": "3.2.13",
3+
"version": "3.3.0",
44
"author": "coderHXL",
55
"description": "x-crawl is a flexible nodejs crawler library.",
66
"license": "MIT",

src/api.ts

+38-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ import fs from 'node:fs'
22
import { writeFile } from 'node:fs/promises'
33
import path from 'node:path'
44
import { JSDOM } from 'jsdom'
5-
import puppeteer, { Browser } from 'puppeteer'
5+
import puppeteer, { Browser, Protocol } from 'puppeteer'
66

77
import { batchRequest, syncBatchRequest } from './request'
88
import { quickSort } from './sort'
@@ -18,6 +18,7 @@ import {
1818
} from './utils'
1919

2020
import {
21+
Cookies,
2122
CrawlDataConfig,
2223
CrawlFileConfig,
2324
CrawlPage,
@@ -93,6 +94,36 @@ async function useBatchRequestByMode(
9394
}
9495
}
9596

97+
/**
 * Normalizes the `cookies` option of a crawlPage request into the array form
 * that is spread into `page.setCookie(...)`.
 *
 * Accepted inputs:
 * - a cookie string such as `"a=1; b=2"`,
 * - a single cookie object,
 * - an array of cookie objects.
 *
 * Every returned cookie carries a `url`: cookies that do not specify one (or
 * specify an empty one) fall back to the page `url`.
 *
 * @param url - URL of the page being crawled; used as the default cookie url.
 * @param cookies - Cookies in any of the supported shapes.
 * @returns A new array of cookie objects. The caller's objects are never
 *   mutated; copies are returned instead.
 */
function parseCrawlPageCookies(
  url: string,
  cookies: Cookies
): Protocol.Network.CookieParam[] {
  if (typeof cookies === 'string') {
    const parsed: Protocol.Network.CookieParam[] = []

    // Split on ';' alone and trim, so both "a=1; b=2" and "a=1;b=2" work.
    for (const segment of cookies.split(';')) {
      const pair = segment.trim()
      if (!pair) continue // empty string or trailing ';'

      // Only the FIRST '=' separates name from value; the value itself may
      // legitimately contain '=' (e.g. base64 padding).
      const separatorIndex = pair.indexOf('=')
      const name = (
        separatorIndex === -1 ? pair : pair.slice(0, separatorIndex)
      ).trim()
      const value =
        separatorIndex === -1 ? '' : pair.slice(separatorIndex + 1).trim()

      if (!name) continue // a cookie without a name cannot be set

      parsed.push({ name, value, url })
    }

    return parsed
  }

  // Guards against null/undefined and any non-object value slipping through.
  if (!cookies || typeof cookies !== 'object') return []

  const list = Array.isArray(cookies) ? cookies : [cookies]

  // Copy each cookie so the default url is not written onto the caller's data.
  return list.map((cookie) => ({ ...cookie, url: cookie.url || url }))
}
126+
96127
export function createCrawlPage(baseConfig: LoaderXCrawlBaseConfig) {
97128
let browser: Browser | null = null
98129
let createBrowserState: Promise<void> | null = null
@@ -143,6 +174,12 @@ export function createCrawlPage(baseConfig: LoaderXCrawlBaseConfig) {
143174
await page.setExtraHTTPHeaders(Headers as any as Record<string, string>)
144175
}
145176

177+
if (requestConfig.cookies) {
178+
await page.setCookie(
179+
...parseCrawlPageCookies(requestConfig.url, requestConfig.cookies)
180+
)
181+
}
182+
146183
let httpResponse = null
147184
try {
148185
httpResponse = await page!.goto(requestConfig.url, {

src/types/api.ts

+15-6
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import { IncomingHttpHeaders } from 'node:http'
2-
import { Browser, HTTPResponse, Page } from 'puppeteer'
2+
import { Browser, HTTPResponse, Page, Protocol } from 'puppeteer'
33
import { JSDOM } from 'jsdom'
44

55
import { RequestConfigObjectV1, RequestConfigObjectV2 } from './request'
@@ -23,7 +23,7 @@ export type MergeConfigRawConfig = {
2323
}
2424

2525
export type MergeConfigV1 = {
26-
requestConfig: RequestConfigObjectV1[]
26+
requestConfig: CrawlBaseConfigV1[]
2727
intervalTime?: IntervalTime
2828
}
2929

@@ -36,16 +36,25 @@ export type MergeConfigV2<T extends AnyObject> = MapTypeObject<
3636
}
3737

3838
/* API Config */
39-
export type CrawlPageConfig = string | RequestConfigObjectV1
39+
export type Cookies =
40+
| string
41+
| Protocol.Network.CookieParam
42+
| Protocol.Network.CookieParam[]
43+
44+
export interface CrawlBaseConfigV1 extends RequestConfigObjectV1 {
45+
cookies?: Cookies
46+
}
4047

41-
export interface CrawlBaseConfigV1 {
48+
export interface CrawlBaseConfigV2 {
4249
requestConfig: RequestConfig | RequestConfig[]
4350
intervalTime?: IntervalTime
4451
}
4552

46-
export interface CrawlDataConfig extends CrawlBaseConfigV1 {}
53+
export type CrawlPageConfig = string | CrawlBaseConfigV1
54+
55+
export interface CrawlDataConfig extends CrawlBaseConfigV2 {}
4756

48-
export interface CrawlFileConfig extends CrawlBaseConfigV1 {
57+
export interface CrawlFileConfig extends CrawlBaseConfigV2 {
4958
fileConfig: {
5059
storeDir: string
5160
extension?: string

0 commit comments

Comments
 (0)