@@ -12,11 +12,11 @@ import { JSONPath } from 'jsonpath-plus';
1212const baseUrl = 'https://www.gq.com.tw' ;
1313
1414const categoryTitleMap : Record < string , string > = {
15- life : 'Life ' ,
16- fashion : 'Fashion ' ,
17- entertainment : 'Entertainment ' ,
18- gadget : 'Gadget ' ,
19- bettermen : 'Better Men ' ,
15+ life : 'LIFE ' , // category route key -> display title; the handler reads this map to build its fallback feed title when no subcategory is given
16+ fashion : 'FASHION ' ,
17+ entertainment : 'ENTERTAINMENT ' ,
18+ gadget : 'GADGET ' ,
19+ bettermen : 'BETTER MEN ' ,
2020} ;
2121
2222const subcategoryTitleMaps : Record < string , Record < string , string > > = {
@@ -104,73 +104,72 @@ async function handler(ctx: Context): Promise<Data> {
104104 }
105105
106106 const listUrl = `${ baseUrl } /${ category } ${ subcategory ? '/' + subcategory : '' } ` ;
107- const items = await parseWebpage ( listUrl ) ;
107+ const { items, headTitle } = await parseWebpage ( listUrl ) ;
108108 logger . info ( `[gq/tw] fetched ${ items . length } items from ${ listUrl } ` ) ;
109109
110110 const categoryTitle = categoryTitleMap [ category ] ;
111111 const subcategoryTitle = subcategory ? subcategoryTitleMaps [ category ] [ subcategory ] : undefined ;
112- const title = subcategory ? `GQ台灣 - ${ categoryTitle } /${ subcategoryTitle } ` : `GQ台灣 - ${ categoryTitle } ` ;
112+ const fallbackTitle = subcategory ? `${ subcategoryTitle } | GQ Taiwan` : `${ categoryTitle } | GQ Taiwan` ;
113+ const title = headTitle || fallbackTitle ;
113114 return {
114115 title,
115116 link : listUrl ,
116117 item : items . slice ( 0 , limit ) ,
117118 } ;
118119}
120+ interface PageParseResult {
121+ items : DataItem [ ] ; // feed entries built by parseWebpage from the page's preloaded state
122+ headTitle ?: string ; // page title read from the state's 'head.title' key; the handler falls back to a map-based title when this is falsy
123+ }
119124
120- async function parseWebpage ( url : string ) : Promise < DataItem [ ] > {
125+ async function parseWebpage ( url : string ) : Promise < PageParseResult > {
121126 const html = await ofetch ( url ) ;
122127 const $ = load ( html ) ;
123- const containers = $ ( 'div[class^="SummaryCollectionGridItems"]' ) ;
124- const wrappers = containers . find ( 'div[class^="SummaryItemWrapper"]' ) ;
125-
126- const urlMetaMap = buildUrlMetaMap ( extractPreloadedStateObject ( $ ) , baseUrl ) ;
127-
128- const items = wrappers
129- . toArray ( )
130- . map ( ( el ) => {
131- const $el = $ ( el ) ;
132-
133- const linkEl = $el . find ( 'div[class^="SummaryItemContent"] a' ) . first ( ) ;
134- const linkPath = linkEl . attr ( 'href' ) ?. trim ( ) ;
135- if ( ! linkPath ) {
136- return null ;
137- }
138- const link = linkPath . startsWith ( 'http' ) ? linkPath : new URL ( linkPath , baseUrl ) . toString ( ) ;
139-
140- const imgEl = $el . find ( 'div[class^="SummaryItemAssetContainer"] img' ) . first ( ) ;
141- const imgSrc = imgEl . attr ( 'src' ) ?. trim ( ) || imgEl . attr ( 'data-src' ) ?. trim ( ) || '' ;
142-
143- const title = $el
144- . find ( 'div[class^="SummaryItemContent"] a > h2' )
145- . text ( )
146- . trim ( ) ;
147-
148- const meta = urlMetaMap . get ( link ) ?? urlMetaMap . get ( decodeURI ( link ) ) ;
149- const pubDateText = meta ?. pubDate ;
150- const timeEl = $el . find ( 'div[class^="SummaryItemBylineWrapper"] > time' ) ;
151- const timeText = timeEl . text ( ) . trim ( ) ;
152- const pubDate = pubDateText ? parseDate ( pubDateText ) : ( timeText ? parseDate ( timeText , 'YYYY年M月D日' ) : undefined ) ;
153-
154- const textDescription = meta ?. description ;
155- const description = Boolean ( imgSrc ) || Boolean ( textDescription )
156- ? art ( path . join ( __dirname , 'templates/description.art' ) , { src : imgSrc || undefined , alt : title , text : textDescription } )
157- : undefined ;
158-
159- return {
160- title,
161- link,
162- pubDate,
163- description,
164- image : imgSrc || undefined ,
165- } as DataItem ;
166- } )
167- . filter ( Boolean ) as DataItem [ ] ;
168-
169- logger . info ( `[gq/tw] parsed ${ items . length } items from list page ${ url } ` ) ;
170-
171- return items ;
128+ // Fetches the list page and builds the feed items plus the page title from its embedded __PRELOADED_STATE__ JSON instead of scraping the DOM.
129+ const stateObj = extractPreloadedStateObject ( $ ) ;
130+ // Throws when the state object, or its `transformed` member, is missing: nothing below can work without it.
131+ if ( ! stateObj || ! stateObj . transformed ) {
132+ throw new Error ( `Failed to extract preloaded state object from ${ url } ` ) ;
133+ }
134+ // Only accept a real string: String(undefined) is the truthy text "undefined", which would defeat the caller's `headTitle || fallbackTitle`.
135+ const headTitle = typeof stateObj . transformed [ 'head.title' ] === 'string' ? stateObj . transformed [ 'head.title' ] : undefined ;
136+ // Collect every item of every container; entries without a url cannot be turned into a link and are dropped.
137+ const nodes = ( JSONPath ( {
138+ path : '$.transformed.bundle.containers[*].items[*]' ,
139+ json : stateObj ,
140+ } ) as any [ ] ) . filter ( ( node ) => node && node . url ) ;
141+ // Convert each remaining state node into a DataItem.
142+ const items : DataItem [ ] = nodes . map ( ( node : any ) => {
143+ const rawUrlPath = String ( node . url ) ;
144+ const urlPath = rawUrlPath . replaceAll ( String . raw `\u002F` , "/" ) ;
145+ const link = new URL ( urlPath , baseUrl ) . toString ( ) ;
146+
147+ const title = String ( node . dangerousHed ?? node . hed ?? '' ) . trim ( ) ;
148+ const pubDate = node . pubDate ? parseDate ( String ( node . pubDate ) ) : undefined ;
149+ // Image URL: first available of the xxl, lg and sm sources.
150+ const imgSources = node . image ?. sources || undefined ;
151+ const imgSrc = imgSources ?. xxl ?. url || imgSources ?. lg ?. url || imgSources ?. sm ?. url || undefined ;
152+ const textDescription = node . dangerousDek ? String ( node . dangerousDek ) : undefined ;
153+ const description = ( Boolean ( imgSrc ) || Boolean ( textDescription ) )
154+ ? art ( path . join ( __dirname , 'templates/description.art' ) , { src : imgSrc , alt : title , text : textDescription } )
155+ : undefined ;
156+
157+ return {
158+ title,
159+ link,
160+ pubDate,
161+ description,
162+ image : imgSrc ,
163+ } as DataItem ;
164+ } ) ;
165+
166+ logger . info ( `[gq/tw] parsed ${ items . length } items from JSON state ${ url } ` ) ;
167+ return { items, headTitle } ;
172168}
173169
170+ /**
171+ * Extract preloaded state object from HTML
172+ */
174173function extractPreloadedStateObject ( $ : ReturnType < typeof load > ) : any | null {
175174 const stateScriptText = $ ( 'script' ) . filter ( ( _ , el ) => $ ( el ) . text ( ) . includes ( '__PRELOADED_STATE__' ) ) . text ( ) ;
176175 if ( ! stateScriptText ) {
@@ -182,38 +181,10 @@ function extractPreloadedStateObject($: ReturnType<typeof load>): any | null {
182181 const braceStart = stateScriptText . indexOf ( '{' , assignIndex ) ;
183182 const braceEnd = stateScriptText . lastIndexOf ( '}' ) ;
184183 if ( braceStart === - 1 || braceEnd === - 1 || braceEnd <= braceStart ) {
184+ logger . info ( '[gq/tw] __PRELOADED_STATE__ json is malformed' ) ;
185185 return null ;
186186 }
187187
188188 const jsonText = stateScriptText . slice ( braceStart , braceEnd + 1 ) ;
189189 return JSON . parse ( jsonText ) ;
190190}
191- interface UrlMeta {
192- pubDate ?: string ;
193- description ?: string ;
194- }
195-
196- function buildUrlMetaMap ( stateObj : any , baseUrl : string ) : Map < string , UrlMeta > {
197- if ( ! stateObj ) {
198- return new Map < string , UrlMeta > ( ) ;
199- }
200-
201- const items = JSONPath ( {
202- path : '$.transformed.bundle.containers[*].items[*]' ,
203- json : stateObj ,
204- } ) as any [ ] ;
205-
206- const entries : Array < [ string , UrlMeta ] > = items
207- . filter ( ( node : any ) => node && node . url )
208- . map ( ( node : any ) => {
209- const urlPath = String ( node . url ) . replaceAll ( String . raw `\u002F` , "/" ) ;
210- const absoluteUrl = new URL ( urlPath , baseUrl ) . toString ( ) ;
211- const meta : UrlMeta = {
212- pubDate : node . pubDate ? String ( node . pubDate ) : undefined ,
213- description : node . dangerousDek ? String ( node . dangerousDek ) : undefined ,
214- } ;
215- return [ absoluteUrl , meta ] ;
216- } ) ;
217-
218- return new Map < string , UrlMeta > ( entries ) ;
219- }
0 commit comments