Skip to content

Commit 39edc69

Browse files
authored
Merge pull request #27 from inaridiy/feat/improve-accuracy
Feat/improve accuracy
2 parents 631750d + fd7348f commit 39edc69

File tree

5 files changed

+4950
-3957
lines changed

5 files changed

+4950
-3957
lines changed

.changeset/honest-pens-jam.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
"webforai": patch
3+
---
4+
5+
Improve the extraction algorithm

packages/webforai/package.json

Lines changed: 7 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,7 @@
44
"description": "A library that provides a web interface for AI",
55
"author": "inaridiy",
66
"license": "MIT",
7-
"keywords": [
8-
"web",
9-
"ai",
10-
"html",
11-
"html2md",
12-
"markdown",
13-
"mdast",
14-
"hast"
15-
],
7+
"keywords": ["web", "ai", "html", "html2md", "markdown", "mdast", "hast"],
168
"repository": {
179
"type": "git",
1810
"url": "https://github.com/inaridiy/webforai.git"
@@ -26,10 +18,7 @@
2618
"prerelease": "pnpm build",
2719
"release": "np"
2820
},
29-
"files": [
30-
"dist",
31-
"!dist/types/**/*.js"
32-
],
21+
"files": ["dist", "!dist/types/**/*.js"],
3322
"main": "dist/cjs/index.js",
3423
"type": "module",
3524
"module": "dist/index.js",
@@ -68,21 +57,11 @@
6857
},
6958
"typesVersions": {
7059
"*": {
71-
"types": [
72-
"./dist/types/index.d.ts"
73-
],
74-
"loaders/playwright": [
75-
"./dist/types/loaders/playwright.d.ts"
76-
],
77-
"loaders/cf-puppeteer": [
78-
"./dist/types/loaders/cf-puppeteer.d.ts"
79-
],
80-
"loaders/fetch": [
81-
"./dist/types/loaders/fetch.d.ts"
82-
],
83-
"loaders/puppeteer": [
84-
"./dist/types/loaders/puppeteer.d.ts"
85-
]
60+
"types": ["./dist/types/index.d.ts"],
61+
"loaders/playwright": ["./dist/types/loaders/playwright.d.ts"],
62+
"loaders/cf-puppeteer": ["./dist/types/loaders/cf-puppeteer.d.ts"],
63+
"loaders/fetch": ["./dist/types/loaders/fetch.d.ts"],
64+
"loaders/puppeteer": ["./dist/types/loaders/puppeteer.d.ts"]
8665
}
8766
},
8867
"peerDependencies": {

packages/webforai/src/extract-hast/readability.ts

Lines changed: 17 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -2,17 +2,8 @@ import type { Element, Nodes as Hast } from "hast";
22
import { select, selectAll } from "hast-util-select";
33
import { toString as hastToString } from "hast-util-to-string";
44
import { filter } from "unist-util-filter";
5-
import { parents } from "unist-util-parents";
65
import { classnames, hasAncestors, isStrInclude, matchString } from "./utils";
76

8-
type ProxiedHast = Hast & { parent: ProxiedHast | null };
9-
10-
declare module "hast" {
11-
interface Element {
12-
parent: ProxiedHast | null;
13-
}
14-
}
15-
167
const UNLIKELY_ROLES = ["menu", "menubar", "complementary", "navigation", "alert", "alertdialog", "dialog"];
178

189
const REGEXPS = {
@@ -159,9 +150,9 @@ const removeEmptyFilter = (node: Hast) => {
159150
export const readabilityExtractHast = (hast: Hast): Hast => {
160151
const lang = String(select("html", hast)?.properties.lang || tryGetLang(hast) || "en");
161152
const body = select("body", hast) ?? hast;
153+
const bodyText = hastToString(body);
162154

163-
const proxiedHast = parents(body) as unknown as ProxiedHast;
164-
const baseFilterd = filter(proxiedHast, (node) => {
155+
const baseFilterd = filter(body, (node) => {
165156
if (!metadataFilter(node as Hast)) {
166157
return false;
167158
}
@@ -175,36 +166,41 @@ export const readabilityExtractHast = (hast: Hast): Hast => {
175166
return { type: "root", children: [] };
176167
}
177168

178-
const baseText = hastToString(baseFilterd);
169+
const baseFilterdText = hastToString(baseFilterd);
170+
let [baseTree, baseText] =
171+
baseFilterdText.length > bodyText.length / 3 || baseFilterdText.length > 5000
172+
? ([baseFilterd as Hast, baseFilterdText] as const)
173+
: ([body as Hast, bodyText] as const);
174+
179175
let minimalLength = lang in BASE_MINIMAL_LENGTH ? BASE_MINIMAL_LENGTH[lang as keyof typeof BASE_MINIMAL_LENGTH] : 500;
180176
if (baseText.length < minimalLength) {
181177
minimalLength = Math.max(0, baseText.length - 200);
182178
}
183179

184-
let bodyTree: Hast = baseFilterd;
185180
for (const selector of BODY_SELECTORS) {
186-
const body = { type: "root" as const, children: selectAll(selector, baseFilterd) };
187-
const bodyText = hastToString(body);
181+
const content = { type: "root" as const, children: selectAll(selector, baseFilterd) };
182+
const contentText = hastToString(content);
188183

189-
if (bodyText.length < 25) {
184+
if (contentText.length < 25) {
190185
continue;
191186
}
192187

193-
const links = selectAll("a", body);
188+
const links = selectAll("a", content);
194189
const linkText = links.map((link) => hastToString(link)).join("");
195190

196191
const linkDensity = linkText.length / bodyText.length;
197192
if (linkDensity > 0.4) {
198193
continue;
199194
}
200195

201-
if (bodyText.length > minimalLength) {
202-
bodyTree = body;
196+
if (contentText.length > minimalLength) {
197+
baseTree = content;
198+
baseText = contentText;
203199
break;
204200
}
205201
}
206202

207-
const finalTree = filter(bodyTree, (node) => {
203+
const finalFilteredTree = filter(baseTree, (node) => {
208204
if (!removeEmptyFilter(node as Hast)) {
209205
return false;
210206
}
@@ -215,5 +211,6 @@ export const readabilityExtractHast = (hast: Hast): Hast => {
215211
return true;
216212
}) as Hast;
217213

214+
const finalTree = hastToString(finalFilteredTree).length > baseText.length / 3 ? finalFilteredTree : baseTree;
218215
return finalTree;
219216
};

packages/webforai/src/html-to-markdown.test.ts

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -123,17 +123,17 @@ describe("htmlToMarkdown", () => {
123123
describe("htmlToMarkdown E2E", () => {
124124
it("should convert npm README to Markdown ", async () => {
125125
const html = await loadHtml("https://www.npmjs.com/package/webforai");
126-
const markdown = htmlToMarkdown(html, { baseUrl: "https://www.npmjs.com/package/webforai" });
126+
const markdown = htmlToMarkdown(html);
127127

128128
// @ts-ignore
129129
const original = await import("../README.md?raw");
130130
const d = distance(markdown, original.default);
131-
expect(d).lte(400); // I'd like to optimise more!
131+
expect(d).lte(200); // I'd like to optimise more!
132132
});
133133

134134
it("should convert GitHub README to Markdown ", async () => {
135135
const html = await loadHtml("https://github.com/inaridiy/webforai");
136-
const markdown = htmlToMarkdown(html, { baseUrl: "https://www.npmjs.com/package/webforai" });
136+
const markdown = htmlToMarkdown(html);
137137

138138
// @ts-ignore
139139
const original = await import("../../../README.md?raw");

0 commit comments

Comments
 (0)