Skip to content

Commit c82d560

Browse files
committed
Migrate to cheerio
1 parent 3dc3c00 commit c82d560

15 files changed

+664
-447
lines changed

README.md

+82-20
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,31 @@ it is TypeScript implementation of [Obelisk](https://github.com/go-shiori/obelis
1717

1818
## Usage
1919

20+
### As CLI tool
21+
22+
```sh
23+
npm install -g @wabarc/cairn
24+
```
25+
26+
```sh
27+
$ cairn -h
28+
29+
Usage: cairn [options] url1 [url2]...[urlN]
30+
31+
CLI tool for saving web page as single HTML file
32+
33+
Options:
34+
-v, --version output the current version
35+
-o, --output <string> path to save archival result
36+
-u, --user-agent <string> set custom user agent
37+
-t, --timeout <number> maximum time (in second) request timeout
38+
--no-js disable JavaScript
39+
--no-css disable CSS styling
40+
--no-embeds remove embedded elements (e.g iframe)
41+
--no-medias remove media elements (e.g img, audio)
42+
-h, --help display help for command
43+
```
44+
2045
### As npm package
2146

2247
```sh
@@ -25,42 +50,79 @@ npm install @wabarc/cairn
2550

2651
```javascript
2752
import { Cairn } from '@wabarc/cairn';
53+
// const cairn = require('@wabarc/cairn');
2854

2955
const cairn = new Cairn();
3056

3157
cairn
3258
.request({ url: url })
33-
.options({ userAgent: 'Cairn/1.0.0' })
59+
.options({ userAgent: 'Cairn/2.0.0' })
3460
.archive()
35-
.then((webpage) => {
36-
console.log(url, webpage);
61+
.then((archived) => {
62+
console.log(archived.url, archived.webpage.html());
3763
})
3864
.catch((err) => console.warn(`${url} => ${JSON.stringify(err)}`));
3965
```
4066

41-
### As CLI tool
67+
#### Instance methods
4268

43-
```sh
44-
npm install -g @wabarc/cairn
69+
##### cairn#request({ url: string }): this
70+
##### cairn#options({}): this
71+
- userAgent?: string;
72+
- disableJS?: boolean;
73+
- disableCSS?: boolean;
74+
- disableEmbeds?: boolean;
75+
- disableMedias?: boolean;
76+
- timeout?: number;
77+
78+
##### cairn#archive(): Promise<Archived>
79+
##### cairn#Archived
80+
- url: string;
81+
- webpage: cheerio.Root;
82+
- status: 200 | 400 | 401 | 403 | 404 | 500 | 502 | 503 | 504;
83+
- contentType: 'text/html' | 'text/plain' | 'text/*';
84+
85+
#### Request Params
86+
87+
##### request
88+
89+
```javascript
90+
{
91+
// `url` is the archival target.
92+
url: 'https://www.github.com'
93+
}
4594
```
4695

47-
```sh
48-
$ cairn -h
96+
##### options
4997

50-
Usage: cairn [options] url1 [url2]...[urlN]
98+
```javascript
99+
{
100+
userAgent: 'Cairn/2.0.0',
51101

52-
CLI tool for saving web page as single HTML file
102+
disableJS: true,
103+
disableCSS: false,
104+
disableEmbeds: false,
105+
disableMedias: true,
53106

54-
Options:
55-
-v, --version output the current version
56-
-o, --output <string> path to save archival result
57-
-u, --user-agent <string> set custom user agent
58-
-t, --timeout <number> maximum time (in second) request timeout
59-
--no-js disable JavaScript
60-
--no-css disable CSS styling
61-
--no-embeds remove embedded elements (e.g iframe)
62-
--no-medias remove media elements (e.g img, audio)
63-
-h, --help display help for command
107+
timeout: 30
108+
}
109+
```
110+
111+
#### Response Schema
112+
113+
For v1.x:
114+
115+
The `archive` method returns the webpage body as a string.
116+
117+
For v2.x, the `archive` method resolves to an `Archived` object:
118+
119+
```javascript
120+
{
121+
url: 'https://github.com/',
122+
webpage: cheerio.Root,
123+
status: 200,
124+
contentType: 'text/html'
125+
}
64126
```
65127

66128
## License

package.json

+5-3
Original file line numberDiff line numberDiff line change
@@ -37,11 +37,12 @@
3737
},
3838
"homepage": "https://github.com/wabarc/cairn#readme",
3939
"dependencies": {
40-
"axios": "^0.20.0",
41-
"commander": "^6.1.0",
42-
"jsdom": "^16.4.0"
40+
"axios": "^0.21.0",
41+
"cheerio": "^1.0.0-rc.3",
42+
"commander": "^6.1.0"
4343
},
4444
"devDependencies": {
45+
"@types/cheerio": "^0.22.22",
4546
"@types/jest": "^26.0.15",
4647
"@types/node": "^14.14.2",
4748
"@typescript-eslint/eslint-plugin": "^4.5.0",
@@ -50,6 +51,7 @@
5051
"eslint-plugin-jest": "^24.1.0",
5152
"eslint-plugin-prettier": "^3.1.4",
5253
"jest": "^26.6.1",
54+
"jsdom": "^16.4.0",
5355
"nodemon": "^2.0.6",
5456
"prettier": "^2.1.2",
5557
"ts-jest": "^26.4.2",

src/archiver.ts

+21-37
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
import { Archiver as ArchiverImpl, Options, Requests, Webpage } from './types/cairn';
2-
import { Err, HTTP, isValidURL } from './utils';
1+
import { Archiver as ArchiverImpl, Options, Requests, Archived } from './types/cairn';
2+
import { err, http, isValidURL } from './utils';
33
import { HTML } from './html';
44

55
export class Archiver implements ArchiverImpl {
@@ -23,7 +23,7 @@ export class Archiver implements ArchiverImpl {
2323
request(r: Requests): this {
2424
const { url } = r;
2525
if (!isValidURL(url)) {
26-
Err('request url is not specified');
26+
err('request url is not specified');
2727
}
2828

2929
this.req.url = url;
@@ -50,43 +50,27 @@ export class Archiver implements ArchiverImpl {
5050
* @return {Promise<Archived>} resolves with the archived result (url, webpage, status, contentType)
5151
* @api public
5252
*/
53-
async archive(): Promise<string> {
54-
return await (async () => {
55-
let webpage: Webpage;
56-
let content = '';
57-
let process = false;
53+
async archive(): Promise<Archived> {
54+
const archived: Archived = { url: this.req.url, webpage: null, status: 400, contentType: 'text/html' };
55+
const response = await this.download(this.req.url).catch((err) => err(err));
56+
if (response.isAxiosError === true || !response.headers) {
57+
return archived;
58+
}
59+
60+
const contentType = response.headers['content-type'] || response.headers['Content-Type'] || '';
61+
// Check the type of the downloaded file.
62+
// If it's not HTML, `webpage` stays null; only status and content type are reported.
63+
if (contentType.includes('text/html') === true) {
64+
// If it's HTML, process it.
65+
archived.webpage = await new HTML(this.opt).process({ uri: this.req.url, html: response.data });
66+
}
67+
archived.status = response.status || archived.status;
68+
archived.contentType = contentType;
5869

59-
return await this.download(this.req.url)
60-
.then((response) => {
61-
// Check the type of the downloaded file.
62-
// If it's not HTML, just return it as it is.
63-
if (response.isAxiosError === true) {
64-
return content;
65-
}
66-
if (!response.headers) {
67-
return content;
68-
}
69-
const contentType = response.headers['content-type'] || response.headers['Content-Type'] || '';
70-
process = contentType.includes('text/html');
71-
webpage = { uri: this.req.url, content: response.data, contentType: contentType };
72-
})
73-
.then(async () => {
74-
if (process === true) {
75-
// If it's HTML process it
76-
content = await new HTML(this.opt).process(webpage);
77-
}
78-
return content;
79-
})
80-
.catch((err) => {
81-
console.warn(err);
82-
return content;
83-
});
84-
})();
70+
return archived;
8571
}
8672

8773
async download(url: string, referer?: string): Promise<any> {
88-
const http = new HTTP();
89-
9074
if (this.opt.userAgent) {
9175
http.setHeader('User-Agent', this.opt.userAgent);
9276
}
@@ -95,6 +79,6 @@ export class Archiver implements ArchiverImpl {
9579
http.setOptions({ timeout: this.opt.timeout });
9680
}
9781

98-
return await http.fetch(url).catch((err) => Err(err));
82+
return await http.setResponseType('text').fetch(url);
9983
}
10084
}

src/cairn.ts

+4-31
Original file line numberDiff line numberDiff line change
@@ -1,43 +1,16 @@
1-
import { Options, Requests } from './types/cairn';
21
import { Archiver } from './archiver';
2+
export { Archived } from './types';
33

44
process.on('uncaughtException', (e) => {
55
console.error(e);
66
});
77

8-
class Cairn {
9-
private arc: Archiver;
8+
class Cairn extends Archiver {}
109

11-
/**
12-
* Initialize a new `Cairn`.
13-
*
14-
* @api public
15-
*/
16-
constructor() {
17-
this.arc = new Archiver();
18-
}
19-
20-
request(r: Requests): this {
21-
this.arc.request(r);
22-
23-
return this;
24-
}
25-
26-
options(o: Options): this {
27-
this.arc.options(o);
28-
return this;
29-
}
30-
31-
archive(): Promise<string> {
32-
return this.arc.archive();
33-
}
34-
}
10+
const cairn = new Cairn();
3511

36-
exports = module.exports = new Cairn();
12+
exports = module.exports = cairn;
3713
exports.cairn = exports;
38-
3914
exports.Cairn = Cairn;
4015

41-
const cairn = new Cairn();
42-
4316
export { Cairn, cairn };

src/cli.ts

+16-8
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
11
#!/usr/bin/env node
2-
32
import { Options } from './types/cairn';
43
import { Command } from 'commander';
5-
import { Cairn } from './cairn';
4+
import { Archiver } from './archiver';
65
import { isValidURL, createFileName } from './utils';
76
import { statSync, writeFile } from 'fs';
87

@@ -15,7 +14,7 @@ class Handler {
1514
this.opt = {};
1615
}
1716

18-
main() {
17+
async main() {
1918
const program = this.parser();
2019

2120
if (this.url.length < 1) {
@@ -32,7 +31,7 @@ class Handler {
3231
filepath = program.output + '/';
3332
}
3433

35-
const output = (url: string, filename: string, content: string) => {
34+
const output = async (url: string, filename: string, content: string) => {
3635
if (program.output === '-') {
3736
console.info(content);
3837
} else {
@@ -46,20 +45,29 @@ class Handler {
4645
}
4746
};
4847

49-
const cairn = new Cairn();
48+
const cairn = new Archiver();
5049
for (const url of this.url) {
5150
if (!isValidURL(url)) {
5251
console.info(`${url} => request url is not specified\n`);
5352
continue;
5453
}
5554
const filename = filepath + createFileName(url);
5655

57-
cairn
56+
await cairn
5857
.request({ url: url })
5958
.options(this.opt)
6059
.archive()
61-
.then((webpage) => {
62-
output(url, filename, webpage);
60+
.then(async (archived) => {
61+
if (!archived.webpage || typeof archived.webpage.root !== 'function') {
62+
return;
63+
}
64+
65+
const html = archived.webpage.root() ? archived.webpage.root().html() : '';
66+
if (!html) {
67+
console.warn(`${url} => archival failure. [status: ${archived.status}]`);
68+
return;
69+
}
70+
await output(url, filename, html || '');
6371
})
6472
.catch((err) => console.warn(`${url} => ${JSON.stringify(err)}`));
6573
}

0 commit comments

Comments
 (0)