Skip to content

Commit fbbec60

Browse files
committed
fix: code review
- Use the title from the embedded JSON by default; the fallback title follows the same format.
- Extract data from the embedded JSON instead of from HTML elements.
1 parent 3237d23 commit fbbec60

File tree

1 file changed

+57
-86
lines changed

1 file changed

+57
-86
lines changed

lib/routes/gq/tw/tw.ts

Lines changed: 57 additions & 86 deletions
Original file line number | Diff line number | Diff line change
@@ -12,11 +12,11 @@ import { JSONPath } from 'jsonpath-plus';
1212
const baseUrl = 'https://www.gq.com.tw';
1313

1414
const categoryTitleMap: Record<string, string> = {
15-
life: 'Life',
16-
fashion: 'Fashion',
17-
entertainment: 'Entertainment',
18-
gadget: 'Gadget',
19-
bettermen: 'Better Men',
15+
life: 'LIFE',
16+
fashion: 'FASHION',
17+
entertainment: 'ENTERTAINMENT',
18+
gadget: 'GADGET',
19+
bettermen: 'BETTER MEN',
2020
};
2121

2222
const subcategoryTitleMaps: Record<string, Record<string, string>> = {
@@ -104,73 +104,72 @@ async function handler(ctx: Context): Promise<Data> {
104104
}
105105

106106
const listUrl = `${baseUrl}/${category}${subcategory ? '/' + subcategory : ''}`;
107-
const items = await parseWebpage(listUrl);
107+
const { items, headTitle } = await parseWebpage(listUrl);
108108
logger.info(`[gq/tw] fetched ${items.length} items from ${listUrl}`);
109109

110110
const categoryTitle = categoryTitleMap[category];
111111
const subcategoryTitle = subcategory ? subcategoryTitleMaps[category][subcategory] : undefined;
112-
const title = subcategory ? `GQ台灣 - ${categoryTitle}/${subcategoryTitle}` : `GQ台灣 - ${categoryTitle}`;
112+
const fallbackTitle = subcategory ? `${subcategoryTitle} | GQ Taiwan` : `${categoryTitle} | GQ Taiwan`;
113+
const title = headTitle || fallbackTitle;
113114
return {
114115
title,
115116
link: listUrl,
116117
item: items.slice(0, limit),
117118
};
118119
}
120+
interface PageParseResult {
121+
items: DataItem[];
122+
headTitle?: string;
123+
}
119124

120-
async function parseWebpage(url: string): Promise<DataItem[]> {
125+
async function parseWebpage(url: string): Promise<PageParseResult> {
121126
const html = await ofetch(url);
122127
const $ = load(html);
123-
const containers = $('div[class^="SummaryCollectionGridItems"]');
124-
const wrappers = containers.find('div[class^="SummaryItemWrapper"]');
125-
126-
const urlMetaMap = buildUrlMetaMap(extractPreloadedStateObject($), baseUrl);
127-
128-
const items = wrappers
129-
.toArray()
130-
.map((el) => {
131-
const $el = $(el);
132-
133-
const linkEl = $el.find('div[class^="SummaryItemContent"] a').first();
134-
const linkPath = linkEl.attr('href')?.trim();
135-
if (!linkPath) {
136-
return null;
137-
}
138-
const link = linkPath.startsWith('http') ? linkPath : new URL(linkPath, baseUrl).toString();
139-
140-
const imgEl = $el.find('div[class^="SummaryItemAssetContainer"] img').first();
141-
const imgSrc = imgEl.attr('src')?.trim() || imgEl.attr('data-src')?.trim() || '';
142-
143-
const title = $el
144-
.find('div[class^="SummaryItemContent"] a > h2')
145-
.text()
146-
.trim();
147-
148-
const meta = urlMetaMap.get(link) ?? urlMetaMap.get(decodeURI(link));
149-
const pubDateText = meta?.pubDate;
150-
const timeEl = $el.find('div[class^="SummaryItemBylineWrapper"] > time');
151-
const timeText = timeEl.text().trim();
152-
const pubDate = pubDateText ? parseDate(pubDateText) : (timeText ? parseDate(timeText, 'YYYY年M月D日') : undefined);
153-
154-
const textDescription = meta?.description;
155-
const description = Boolean(imgSrc) || Boolean(textDescription)
156-
? art(path.join(__dirname, 'templates/description.art'), { src: imgSrc || undefined, alt: title, text: textDescription })
157-
: undefined;
158-
159-
return {
160-
title,
161-
link,
162-
pubDate,
163-
description,
164-
image: imgSrc || undefined,
165-
} as DataItem;
166-
})
167-
.filter(Boolean) as DataItem[];
168-
169-
logger.info(`[gq/tw] parsed ${items.length} items from list page ${url}`);
170-
171-
return items;
128+
129+
const stateObj = extractPreloadedStateObject($);
130+
131+
if (!stateObj || !stateObj.transformed) {
132+
throw new Error(`Failed to extract preloaded state object from ${url}`);
133+
}
134+
135+
const headTitle = String(stateObj.transformed['head.title']);
136+
137+
const nodes = (JSONPath({
138+
path: '$.transformed.bundle.containers[*].items[*]',
139+
json: stateObj,
140+
}) as any[]).filter((node) => node && node.url);
141+
142+
const items: DataItem[] = nodes.map((node: any) => {
143+
const rawUrlPath = String(node.url);
144+
const urlPath = rawUrlPath.replaceAll(String.raw`\u002F`, "/");
145+
const link = new URL(urlPath, baseUrl).toString();
146+
147+
const title = String(node.dangerousHed ?? node.hed ?? '').trim();
148+
const pubDate = node.pubDate ? parseDate(String(node.pubDate)) : undefined;
149+
150+
const imgSources = node.image?.sources || undefined;
151+
const imgSrc = imgSources?.xxl?.url || imgSources?.lg?.url || imgSources?.sm?.url || undefined;
152+
const textDescription = node.dangerousDek ? String(node.dangerousDek) : undefined;
153+
const description = (Boolean(imgSrc) || Boolean(textDescription))
154+
? art(path.join(__dirname, 'templates/description.art'), { src: imgSrc, alt: title, text: textDescription })
155+
: undefined;
156+
157+
return {
158+
title,
159+
link,
160+
pubDate,
161+
description,
162+
image: imgSrc,
163+
} as DataItem;
164+
});
165+
166+
logger.info(`[gq/tw] parsed ${items.length} items from JSON state ${url}`);
167+
return { items, headTitle };
172168
}
173169

170+
/**
171+
* Extract preloaded state object from HTML
172+
*/
174173
function extractPreloadedStateObject($: ReturnType<typeof load>): any | null {
175174
const stateScriptText = $('script').filter((_, el) => $(el).text().includes('__PRELOADED_STATE__')).text();
176175
if (!stateScriptText) {
@@ -182,38 +181,10 @@ function extractPreloadedStateObject($: ReturnType<typeof load>): any | null {
182181
const braceStart = stateScriptText.indexOf('{', assignIndex);
183182
const braceEnd = stateScriptText.lastIndexOf('}');
184183
if (braceStart === -1 || braceEnd === -1 || braceEnd <= braceStart) {
184+
logger.info('[gq/tw] __PRELOADED_STATE__ json is malformed');
185185
return null;
186186
}
187187

188188
const jsonText = stateScriptText.slice(braceStart, braceEnd + 1);
189189
return JSON.parse(jsonText);
190190
}
191-
interface UrlMeta {
192-
pubDate?: string;
193-
description?: string;
194-
}
195-
196-
function buildUrlMetaMap(stateObj: any, baseUrl: string): Map<string, UrlMeta> {
197-
if (!stateObj) {
198-
return new Map<string, UrlMeta>();
199-
}
200-
201-
const items = JSONPath({
202-
path: '$.transformed.bundle.containers[*].items[*]',
203-
json: stateObj,
204-
}) as any[];
205-
206-
const entries: Array<[string, UrlMeta]> = items
207-
.filter((node: any) => node && node.url)
208-
.map((node: any) => {
209-
const urlPath = String(node.url).replaceAll(String.raw`\u002F`, "/");
210-
const absoluteUrl = new URL(urlPath, baseUrl).toString();
211-
const meta: UrlMeta = {
212-
pubDate: node.pubDate ? String(node.pubDate) : undefined,
213-
description: node.dangerousDek ? String(node.dangerousDek) : undefined,
214-
};
215-
return [absoluteUrl, meta];
216-
});
217-
218-
return new Map<string, UrlMeta>(entries);
219-
}

0 commit comments

Comments
 (0)