@@ -146,26 +146,23 @@ export async function getCategoryList(): Promise<Category[]> {
146146}
147147
148148/**
149- * 对标题进行分词 ,支持中英文混合
149+ * 对文本进行分词 ,支持中英文混合
150150 *
151151 * - 优先使用 Intl.Segmenter(在支持的运行时中效果更好)
152152 * - 在不支持 Segmenter 的环境(如部分 Node 运行时)下
153153 * 回退到基于正则的简单分词,以避免构建报错
154154 * - 过滤标点和空白,英文统一小写
155155 */
156- function tokenizeTitle ( title : string ) : Set < string > {
156+ function tokenize ( text : string ) : Set < string > {
157157 const tokens = new Set < string > ( ) ;
158158
159- // 运行时可能不支持 Intl.Segmenter(例如部分 Node 环境)
160- // 为了避免 SSR/构建时报错,这里做兼容处理
161159 const hasSegmenter =
162160 typeof Intl !== "undefined" &&
163161 "Segmenter" in Intl &&
164162 typeof ( Intl as any ) . Segmenter === "function" ;
165163
166164 if ( ! hasSegmenter ) {
167- // 简单回退方案:按照空白和标点拆分
168- const basicTokens = title
165+ const basicTokens = text
169166 . toLowerCase ( )
170167 . split ( / [ \s \p{ P} ] + / gu)
171168 . filter ( Boolean ) ;
@@ -175,11 +172,10 @@ function tokenizeTitle(title: string): Set<string> {
175172 return tokens ;
176173 }
177174
178- // 使用 Intl.Segmenter 进行更精细的中英文混合分词
179175 const segmenter = new ( Intl as any ) . Segmenter ( "zh" , {
180176 granularity : "word" ,
181177 } ) ;
182- for ( const { segment, isWordLike } of segmenter . segment ( title ) ) {
178+ for ( const { segment, isWordLike } of segmenter . segment ( text ) ) {
183179 if ( ! isWordLike ) {
184180 continue ;
185181 }
@@ -206,17 +202,85 @@ function jaccardSimilarity(a: Set<string>, b: Set<string>): number {
206202}
207203
/**
 * Computes an inverse-document-frequency (IDF) weight for every tag.
 *
 * Rare tags receive a larger weight and common tags a smaller one, using the
 * smoothed form
 *
 *   IDF(tag) = ln(1 + N / (1 + df(tag)))
 *
 * where N is the number of posts and df(tag) is the number of posts carrying
 * the tag. The unsmoothed ln(N / (1 + df)) evaluates to 0 when df = N - 1 and
 * goes negative when df = N; zero or negative weights break any ratio built
 * from them. The smoothed form is strictly positive while keeping the same
 * ordering (a smaller df always yields a larger weight).
 *
 * @param allPosts Posts to gather tag statistics from. A post without `tags`
 *   contributes nothing.
 * @returns Map from tag to its IDF weight. Only tags that occur in `allPosts`
 *   have an entry.
 */
function computeTagIDF(
  allPosts: { data: { tags?: string[] } }[],
): Map<string, number> {
  const totalPosts = allPosts.length;
  const documentFrequency = new Map<string, number>();

  for (const post of allPosts) {
    // Count each tag once per post: a tag listed twice on the same post must
    // not push its document frequency above the number of posts.
    for (const tag of new Set(post.data.tags ?? [])) {
      documentFrequency.set(tag, (documentFrequency.get(tag) ?? 0) + 1);
    }
  }

  const tagIDF = new Map<string, number>();
  for (const [tag, df] of documentFrequency) {
    // totalPosts / (1 + df) > 0 for every counted tag, so the log is > 0.
    tagIDF.set(tag, Math.log(1 + totalPosts / (1 + df)));
  }
  return tagIDF;
}
226+
/**
 * Computes an IDF-weighted tag similarity in the range [0, 1].
 *
 * The score is the summed weight of the current post's tags that the
 * candidate also has, divided by the summed weight of all of the current
 * post's tags. It is therefore asymmetric: it measures how much of the
 * current post's tag weight the candidate covers.
 *
 * @param currentTags Tags of the post being viewed. Duplicates are ignored.
 * @param candidateTags Tags of the candidate post.
 * @param tagIDF Weight per tag. A tag missing from the map, or one with a
 *   negative weight, is treated as weight 0.
 * @returns 0 when either tag list is empty; otherwise a value in [0, 1].
 */
function idfWeightedTagSimilarity(
  currentTags: string[],
  candidateTags: string[],
  tagIDF: Map<string, number>,
): number {
  if (currentTags.length === 0 || candidateTags.length === 0) {
    return 0;
  }

  const candidateSet = new Set(candidateTags);
  // De-duplicate so a repeated tag cannot inflate the denominator.
  const uniqueCurrent = new Set(currentTags);

  let sharedWeight = 0;
  let totalWeight = 0;
  let sharedCount = 0;

  for (const tag of uniqueCurrent) {
    // Clamp at 0: a negative weight would let the ratio leave [0, 1].
    const weight = Math.max(0, tagIDF.get(tag) ?? 0);
    totalWeight += weight;
    if (candidateSet.has(tag)) {
      sharedWeight += weight;
      sharedCount += 1;
    }
  }

  if (totalWeight > 0) {
    return sharedWeight / totalWeight;
  }

  // Every current tag has zero weight, so the weights carry no information.
  // Fall back to the unweighted overlap so shared tags still score above 0.
  return sharedCount / uniqueCurrent.size;
}
254+
255+ /**
256+ * 获取相关文章推荐 — 多算法加权评分
257+ *
258+ * 评分维度(权重可通过 relatedPostsConfig.weights 配置):
259+ * - tagSimilarity: 标签相似度(Jaccard 或 IDF 加权)
260+ * - titleSimilarity: 标题分词 Jaccard 相似度
261+ * - descriptionSimilarity: 描述文本分词相似度
262+ * - categoryMatch: 同分类加分
263+ * - freshness: 时间新鲜度(指数衰减)
264+ *
265+ * 总分 = Σ(维度分数 × 权重) / Σ权重
215266 */
216267export async function getRelatedPosts (
217268 currentPost : CollectionEntry < "posts" > ,
218269 maxCount = 5 ,
219270) : Promise < PostForList [ ] > {
271+ const { relatedPostsConfig } = await import ( "../config/index.js" ) ;
272+ const weights = relatedPostsConfig . weights ?? { } ;
273+ const halfLife = relatedPostsConfig . freshnessHalfLife ?? 180 ;
274+
275+ const w = {
276+ tagSimilarity : weights . tagSimilarity ?? 1.0 ,
277+ titleSimilarity : weights . titleSimilarity ?? 0.6 ,
278+ descriptionSimilarity : weights . descriptionSimilarity ?? 0.4 ,
279+ categoryMatch : weights . categoryMatch ?? 0.3 ,
280+ freshness : weights . freshness ?? 0.2 ,
281+ useIDF : weights . tagIDF ?? true ,
282+ } ;
283+
220284 const allPosts = await getCollection < "posts" > ( "posts" , ( { data } ) => {
221285 return import . meta. env . PROD ? data . draft !== true : true ;
222286 } ) ;
@@ -226,81 +290,81 @@ export async function getRelatedPosts(
226290 ( p ) => p . id !== currentPost . id && ! p . data . password ,
227291 ) ;
228292
229- const currentTags = new Set ( currentPost . data . tags || [ ] ) ;
230- const currentTokens = tokenizeTitle ( currentPost . data . title ) ;
293+ if ( candidates . length === 0 ) return [ ] ;
294+
295+ const currentTags = currentPost . data . tags || [ ] ;
296+ const currentTokens = tokenize ( currentPost . data . title ) ;
297+ const currentDesc = tokenize ( currentPost . data . description || "" ) ;
231298 const currentCategory = currentPost . data . category || "" ;
232299 const now = Date . now ( ) ;
233300
234- const scored = candidates . map ( ( post ) => {
235- const postTags = new Set ( post . data . tags || [ ] ) ;
301+ // 预计算标签 IDF
302+ const tagIDF = w . useIDF ? computeTagIDF ( allPosts ) : new Map < string , number > ( ) ;
236303
237- // tagMatchScore (0-100)
238- const tagMatchScore = jaccardSimilarity ( currentTags , postTags ) * 100 ;
304+ // 权重总和(用于归一化)
305+ const totalWeight =
306+ w . tagSimilarity + w . titleSimilarity + w . descriptionSimilarity + w . categoryMatch + w . freshness ;
239307
240- // titleSimilarityScore (0-100)
241- const postTokens = tokenizeTitle ( post . data . title ) ;
242- const titleSimilarityScore =
243- jaccardSimilarity ( currentTokens , postTokens ) * 100 ;
308+ const scored = candidates . map ( ( post ) => {
309+ const postTags = post . data . tags || [ ] ;
310+
311+ // 标签相似度
312+ let tagScore : number ;
313+ if ( w . useIDF && currentTags . length > 0 && postTags . length > 0 ) {
314+ tagScore = idfWeightedTagSimilarity ( currentTags , postTags , tagIDF ) ;
315+ } else {
316+ tagScore = jaccardSimilarity ( new Set ( currentTags ) , new Set ( postTags ) ) ;
317+ }
244318
245- // timeFreshnessScore (0-30): 6 个月半衰期
246- const daysSincePublished =
247- ( now - new Date ( post . data . published ) . getTime ( ) ) /
248- ( 1000 * 60 * 60 * 24 ) ;
249- const timeFreshnessScore =
250- 30 * Math . exp ( ( - Math . LN2 * daysSincePublished ) / 180 ) ;
319+ // 标题相似度
320+ const postTokens = tokenize ( post . data . title ) ;
321+ const titleScore = jaccardSimilarity ( currentTokens , postTokens ) ;
322+
323+ // 描述相似度
324+ const postDesc = tokenize ( post . data . description || "" ) ;
325+ const descScore = jaccardSimilarity ( currentDesc , postDesc ) ;
251326
252- // categoryBonus (0 or 10)
327+ // 分类匹配
253328 const postCategory = post . data . category || "" ;
254- const categoryBonus =
255- currentCategory && postCategory && currentCategory === postCategory
256- ? 10
257- : 0 ;
329+ const catScore =
330+ currentCategory && postCategory && currentCategory === postCategory ? 1 : 0 ;
331+
332+ // 时间新鲜度(指数衰减,半衰期可配)
333+ const daysSincePublished =
334+ ( now - new Date ( post . data . published ) . getTime ( ) ) / ( 1000 * 60 * 60 * 24 ) ;
335+ const freshnessScore = Math . exp ( ( - Math . LN2 * daysSincePublished ) / halfLife ) ;
258336
337+ // 加权总分(归一化到 [0, 1])
259338 const totalScore =
260- tagMatchScore +
261- titleSimilarityScore +
262- timeFreshnessScore +
263- categoryBonus ;
264-
265- return {
266- post,
267- totalScore,
268- tagMatchScore,
269- timeFreshnessScore,
270- categoryBonus,
271- } ;
339+ totalWeight === 0
340+ ? 0
341+ : ( tagScore * w . tagSimilarity +
342+ titleScore * w . titleSimilarity +
343+ descScore * w . descriptionSimilarity +
344+ catScore * w . categoryMatch +
345+ freshnessScore * w . freshness ) /
346+ totalWeight ;
347+
348+ return { post, totalScore, tagScore } ;
272349 } ) ;
273350
274351 // 按总分降序排列
275352 scored . sort ( ( a , b ) => b . totalScore - a . totalScore ) ;
276353
277- // 优先取有标签匹配的
278- const withTagMatch = scored . filter ( ( s ) => s . tagMatchScore > 0 ) ;
279- const withoutTagMatch = scored . filter ( ( s ) => s . tagMatchScore === 0 ) ;
354+ // 优先取有标签匹配的,不足时从剩余候选中补充
355+ const withTagMatch = scored . filter ( ( s ) => s . tagScore > 0 ) ;
356+ const withoutTagMatch = scored . filter ( ( s ) => s . tagScore === 0 ) ;
280357
281358 const result : PostForList [ ] = [ ] ;
282359
283360 for ( const s of withTagMatch ) {
284- if ( result . length >= maxCount ) {
285- break ;
286- }
361+ if ( result . length >= maxCount ) break ;
287362 result . push ( { id : s . post . id , data : s . post . data } ) ;
288363 }
289364
290- // 不足时从剩余候选中按 timeFreshnessScore + categoryBonus 降序补充
291- if ( result . length < maxCount ) {
292- withoutTagMatch . sort (
293- ( a , b ) =>
294- b . timeFreshnessScore +
295- b . categoryBonus -
296- ( a . timeFreshnessScore + a . categoryBonus ) ,
297- ) ;
298- for ( const s of withoutTagMatch ) {
299- if ( result . length >= maxCount ) {
300- break ;
301- }
302- result . push ( { id : s . post . id , data : s . post . data } ) ;
303- }
365+ for ( const s of withoutTagMatch ) {
366+ if ( result . length >= maxCount ) break ;
367+ result . push ( { id : s . post . id , data : s . post . data } ) ;
304368 }
305369
306370 return result ;
0 commit comments