@@ -2,17 +2,8 @@ import type { Element, Nodes as Hast } from "hast";
22import { select , selectAll } from "hast-util-select" ;
33import { toString as hastToString } from "hast-util-to-string" ;
44import { filter } from "unist-util-filter" ;
5- import { parents } from "unist-util-parents" ;
65import { classnames , hasAncestors , isStrInclude , matchString } from "./utils" ;
76
8- type ProxiedHast = Hast & { parent : ProxiedHast | null } ;
9-
10- declare module "hast" {
11- interface Element {
12- parent : ProxiedHast | null ;
13- }
14- }
15-
167const UNLIKELY_ROLES = [ "menu" , "menubar" , "complementary" , "navigation" , "alert" , "alertdialog" , "dialog" ] ;
178
189const REGEXPS = {
@@ -159,9 +150,9 @@ const removeEmptyFilter = (node: Hast) => {
159150export const readabilityExtractHast = ( hast : Hast ) : Hast => {
160151 const lang = String ( select ( "html" , hast ) ?. properties . lang || tryGetLang ( hast ) || "en" ) ;
161152 const body = select ( "body" , hast ) ?? hast ;
153+ const bodyText = hastToString ( body ) ;
162154
163- const proxiedHast = parents ( body ) as unknown as ProxiedHast ;
164- const baseFilterd = filter ( proxiedHast , ( node ) => {
155+ const baseFilterd = filter ( body , ( node ) => {
165156 if ( ! metadataFilter ( node as Hast ) ) {
166157 return false ;
167158 }
@@ -175,36 +166,41 @@ export const readabilityExtractHast = (hast: Hast): Hast => {
175166 return { type : "root" , children : [ ] } ;
176167 }
177168
178- const baseText = hastToString ( baseFilterd ) ;
169+ const baseFilterdText = hastToString ( baseFilterd ) ;
170+ let [ baseTree , baseText ] =
171+ baseFilterdText . length > bodyText . length / 3 || baseFilterdText . length > 5000
172+ ? ( [ baseFilterd as Hast , baseFilterdText ] as const )
173+ : ( [ body as Hast , bodyText ] as const ) ;
174+
179175 let minimalLength = lang in BASE_MINIMAL_LENGTH ? BASE_MINIMAL_LENGTH [ lang as keyof typeof BASE_MINIMAL_LENGTH ] : 500 ;
180176 if ( baseText . length < minimalLength ) {
181177 minimalLength = Math . max ( 0 , baseText . length - 200 ) ;
182178 }
183179
184- let bodyTree : Hast = baseFilterd ;
185180 for ( const selector of BODY_SELECTORS ) {
186- const body = { type : "root" as const , children : selectAll ( selector , baseFilterd ) } ;
187- const bodyText = hastToString ( body ) ;
181+ const content = { type : "root" as const , children : selectAll ( selector , baseFilterd ) } ;
182+ const contentText = hastToString ( content ) ;
188183
189- if ( bodyText . length < 25 ) {
184+ if ( contentText . length < 25 ) {
190185 continue ;
191186 }
192187
193- const links = selectAll ( "a" , body ) ;
188+ const links = selectAll ( "a" , content ) ;
194189 const linkText = links . map ( ( link ) => hastToString ( link ) ) . join ( "" ) ;
195190
196191 const linkDensity = linkText . length / contentText . length ; // ratio of anchor text to this candidate's own text; the outer `bodyText` is the whole <body> and would understate the density
197192 if ( linkDensity > 0.4 ) {
198193 continue ;
199194 }
200195
201- if ( bodyText . length > minimalLength ) {
202- bodyTree = body ;
196+ if ( contentText . length > minimalLength ) {
197+ baseTree = content ;
198+ baseText = contentText ;
203199 break ;
204200 }
205201 }
206202
207- const finalTree = filter ( bodyTree , ( node ) => {
203+ const finalFilteredTree = filter ( baseTree , ( node ) => {
208204 if ( ! removeEmptyFilter ( node as Hast ) ) {
209205 return false ;
210206 }
@@ -215,5 +211,6 @@ export const readabilityExtractHast = (hast: Hast): Hast => {
215211 return true ;
216212 } ) as Hast ;
217213
214+ const finalTree = hastToString ( finalFilteredTree ) . length > baseText . length / 3 ? finalFilteredTree : baseTree ;
218215 return finalTree ;
219216} ;
0 commit comments