Skip to content

Commit 8ffce3d

Browse files
feat: multi-algorithm weighted scoring for related posts
Replace the fixed-score formula with a configurable weighted scoring system that combines: tag similarity (Jaccard or IDF-weighted); title token similarity (Jaccard); description text similarity; a category match bonus; and freshness decay (configurable half-life). Weights are normalized, so users only need to adjust their relative importance without manually making them sum to 1.0.
1 parent d4f74a7 commit 8ffce3d

3 files changed

Lines changed: 158 additions & 67 deletions

File tree

src/config/relatedPostsConfig.ts

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,19 @@ import type { RelatedPostsConfig } from "../types/config";
44
// Site configuration for the related-posts feature.
export const relatedPostsConfig: RelatedPostsConfig = {
55
enable: true,
66
maxCount: 5,
7+
8+
// Scoring weights — the per-dimension weights are normalized before use, so they do not need to sum to 1.0
9+
// Raising a weight makes that dimension matter more in the ranking; setting it to 0 ignores that dimension
10+
weights: {
11+
tagSimilarity: 1.0, // Tag similarity (Jaccard, or IDF-weighted when tagIDF is enabled); highest weight, the core signal
12+
titleSimilarity: 0.6, // Jaccard similarity of the tokenized titles
13+
descriptionSimilarity: 0.4, // Similarity of the tokenized description text
14+
categoryMatch: 0.3, // Bonus for being in the same category
15+
freshness: 0.2, // Time freshness (newer posts score higher)
16+
tagIDF: true, // Enable IDF weighting: matches on rare tags count more, common tags count less
17+
},
18+
19+
// Freshness half-life (days): the post age, in days since publication, at which the freshness score decays to one half
20+
// 180 ≈ 6 months, 90 ≈ 3 months (favors recent posts), 365 ≈ 1 year (less sensitive to time)
21+
freshnessHalfLife: 180,
722
};

src/types/config.ts

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -507,6 +507,18 @@ export interface ShareConfig {
507507
// Configuration shape for the related-posts feature.
export interface RelatedPostsConfig {
508508
enable: boolean; // Whether the related-posts feature is enabled
509509
maxCount: number; // Number of related posts
510+
weights?: RelatedPostsWeights; // Scoring weight configuration
511+
freshnessHalfLife?: number; // Freshness half-life in days; defaults to 180
512+
}
513+
514+
// Scoring weights for related posts (all numeric weights are normalized before use)
515+
export interface RelatedPostsWeights {
516+
tagSimilarity?: number; // Tag similarity weight; defaults to 1.0
517+
titleSimilarity?: number; // Title similarity weight; defaults to 0.6
518+
descriptionSimilarity?: number; // Description similarity weight; defaults to 0.4
519+
categoryMatch?: number; // Category match weight; defaults to 0.3
520+
freshness?: number; // Time freshness weight; defaults to 0.2
521+
tagIDF?: boolean; // Whether to use IDF weighting for tags (rare tags weigh more); defaults to true
510522
}
511523

512524
/**

src/utils/content-utils.ts

Lines changed: 131 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -146,26 +146,23 @@ export async function getCategoryList(): Promise<Category[]> {
146146
}
147147

148148
/**
149-
* 对标题进行分词,支持中英文混合
149+
* 对文本进行分词,支持中英文混合
150150
*
151151
* - 优先使用 Intl.Segmenter(在支持的运行时中效果更好)
152152
* - 在不支持 Segmenter 的环境(如部分 Node 运行时)下
153153
* 回退到基于正则的简单分词,以避免构建报错
154154
* - 过滤标点和空白,英文统一小写
155155
*/
156-
function tokenizeTitle(title: string): Set<string> {
156+
function tokenize(text: string): Set<string> {
157157
const tokens = new Set<string>();
158158

159-
// 运行时可能不支持 Intl.Segmenter(例如部分 Node 环境)
160-
// 为了避免 SSR/构建时报错,这里做兼容处理
161159
const hasSegmenter =
162160
typeof Intl !== "undefined" &&
163161
"Segmenter" in Intl &&
164162
typeof (Intl as any).Segmenter === "function";
165163

166164
if (!hasSegmenter) {
167-
// 简单回退方案:按照空白和标点拆分
168-
const basicTokens = title
165+
const basicTokens = text
169166
.toLowerCase()
170167
.split(/[\s\p{P}]+/gu)
171168
.filter(Boolean);
@@ -175,11 +172,10 @@ function tokenizeTitle(title: string): Set<string> {
175172
return tokens;
176173
}
177174

178-
// 使用 Intl.Segmenter 进行更精细的中英文混合分词
179175
const segmenter = new (Intl as any).Segmenter("zh", {
180176
granularity: "word",
181177
});
182-
for (const { segment, isWordLike } of segmenter.segment(title)) {
178+
for (const { segment, isWordLike } of segmenter.segment(text)) {
183179
if (!isWordLike) {
184180
continue;
185181
}
@@ -206,17 +202,85 @@ function jaccardSimilarity(a: Set<string>, b: Set<string>): number {
206202
}
207203

208204
/**
209-
* 获取相关文章推荐
210-
* 评分公式: totalScore = tagMatchScore + titleSimilarityScore + timeFreshnessScore + categoryBonus
211-
* - tagMatchScore (0-100): 标签 Jaccard 相似度 × 100
212-
* - titleSimilarityScore (0-100): 标题分词 Jaccard 相似度 × 100
213-
* - timeFreshnessScore (0-30): 6 个月半衰期指数衰减
214-
* - categoryBonus (0 or 10): 同分类加 10 分
205+
* 计算标签的 IDF(逆文档频率)权重
206+
* 稀有标签(出现频率低)获得更高权重,常见标签权重更低
207+
* IDF(tag) = log(N / (1 + df(tag))),N = 总文章数,df = 包含该标签的文章数
208+
*/
209+
/**
 * Computes a smoothed inverse-document-frequency (IDF) weight for every tag
 * that occurs in `allPosts`. Rare tags receive a larger weight than common ones.
 *
 * The smoothed form `ln((1 + N) / (1 + df)) + 1` is used instead of the plain
 * `ln(N / (1 + df))`: the plain form yields 0 for a tag present in N - 1 posts
 * and a negative value for a tag present in every post, which corrupts any
 * weighted ratio built on top of it. The smoothed weight is always >= 1.
 *
 * @param allPosts Posts to gather tag statistics from; `data.tags` may be absent.
 * @returns Map from tag to its IDF weight. Tags that never occur are not present.
 */
function computeTagIDF(allPosts: { data: { tags?: string[] } }[]): Map<string, number> {
  const tagDF = new Map<string, number>();
  const N = allPosts.length;

  for (const post of allPosts) {
    // Document frequency counts posts, not occurrences: a tag listed twice
    // on the same post must only be counted once.
    for (const tag of new Set(post.data.tags ?? [])) {
      tagDF.set(tag, (tagDF.get(tag) ?? 0) + 1);
    }
  }

  const tagIDF = new Map<string, number>();
  for (const [tag, df] of tagDF) {
    // df <= N, so the ratio is >= 1, the log is >= 0, and the weight is >= 1.
    tagIDF.set(tag, Math.log((1 + N) / (1 + df)) + 1);
  }
  return tagIDF;
}
226+
227+
/**
228+
* 计算 IDF 加权标签相似度
229+
* 对共有标签的 IDF 值求和,归一化到 [0, 1]
230+
*/
231+
/**
 * Computes an IDF-weighted tag similarity in the range [0, 1].
 *
 * The score is the summed IDF weight of the current post's tags that also
 * appear on the candidate, divided by the summed IDF weight of all of the
 * current post's tags.
 *
 * @param currentTags Tags of the post being viewed.
 * @param candidateTags Tags of the candidate post.
 * @param tagIDF Map from tag to IDF weight; missing tags weigh 0.
 * @returns 0 when either tag list is empty, otherwise a value in [0, 1].
 */
function idfWeightedTagSimilarity(
  currentTags: string[],
  candidateTags: string[],
  tagIDF: Map<string, number>,
): number {
  if (currentTags.length === 0 || candidateTags.length === 0) {
    return 0;
  }

  const candidateSet = new Set(candidateTags);
  // Dedupe so a tag repeated on the current post is not weighted twice.
  const uniqueCurrent = new Set(currentTags);

  let sharedWeight = 0;
  let totalWeight = 0;
  let sharedCount = 0;

  for (const tag of uniqueCurrent) {
    const raw = tagIDF.get(tag);
    // Non-positive or non-finite weights would push the ratio outside [0, 1]
    // (or flip its sign), so they are treated as carrying no weight.
    const idf = raw !== undefined && Number.isFinite(raw) && raw > 0 ? raw : 0;
    totalWeight += idf;
    if (candidateSet.has(tag)) {
      sharedWeight += idf;
      sharedCount += 1;
    }
  }

  if (totalWeight > 0) {
    return sharedWeight / totalWeight;
  }

  // No tag carries any usable weight: fall back to the unweighted share of
  // matching tags so that a real tag overlap is not reported as "no match".
  return sharedCount / uniqueCurrent.size;
}
254+
255+
/**
256+
* 获取相关文章推荐 — 多算法加权评分
257+
*
258+
* 评分维度(权重可通过 relatedPostsConfig.weights 配置):
259+
* - tagSimilarity: 标签相似度(Jaccard 或 IDF 加权)
260+
* - titleSimilarity: 标题分词 Jaccard 相似度
261+
* - descriptionSimilarity: 描述文本分词相似度
262+
* - categoryMatch: 同分类加分
263+
* - freshness: 时间新鲜度(指数衰减)
264+
*
265+
* 总分 = Σ(维度分数 × 权重) / Σ权重
215266
*/
216267
export async function getRelatedPosts(
217268
currentPost: CollectionEntry<"posts">,
218269
maxCount = 5,
219270
): Promise<PostForList[]> {
271+
const { relatedPostsConfig } = await import("../config/index.js");
272+
const weights = relatedPostsConfig.weights ?? {};
273+
const halfLife = relatedPostsConfig.freshnessHalfLife ?? 180;
274+
275+
const w = {
276+
tagSimilarity: weights.tagSimilarity ?? 1.0,
277+
titleSimilarity: weights.titleSimilarity ?? 0.6,
278+
descriptionSimilarity: weights.descriptionSimilarity ?? 0.4,
279+
categoryMatch: weights.categoryMatch ?? 0.3,
280+
freshness: weights.freshness ?? 0.2,
281+
useIDF: weights.tagIDF ?? true,
282+
};
283+
220284
const allPosts = await getCollection<"posts">("posts", ({ data }) => {
221285
return import.meta.env.PROD ? data.draft !== true : true;
222286
});
@@ -226,81 +290,81 @@ export async function getRelatedPosts(
226290
(p) => p.id !== currentPost.id && !p.data.password,
227291
);
228292

229-
const currentTags = new Set(currentPost.data.tags || []);
230-
const currentTokens = tokenizeTitle(currentPost.data.title);
293+
if (candidates.length === 0) return [];
294+
295+
const currentTags = currentPost.data.tags || [];
296+
const currentTokens = tokenize(currentPost.data.title);
297+
const currentDesc = tokenize(currentPost.data.description || "");
231298
const currentCategory = currentPost.data.category || "";
232299
const now = Date.now();
233300

234-
const scored = candidates.map((post) => {
235-
const postTags = new Set(post.data.tags || []);
301+
// 预计算标签 IDF
302+
const tagIDF = w.useIDF ? computeTagIDF(allPosts) : new Map<string, number>();
236303

237-
// tagMatchScore (0-100)
238-
const tagMatchScore = jaccardSimilarity(currentTags, postTags) * 100;
304+
// 权重总和(用于归一化)
305+
const totalWeight =
306+
w.tagSimilarity + w.titleSimilarity + w.descriptionSimilarity + w.categoryMatch + w.freshness;
239307

240-
// titleSimilarityScore (0-100)
241-
const postTokens = tokenizeTitle(post.data.title);
242-
const titleSimilarityScore =
243-
jaccardSimilarity(currentTokens, postTokens) * 100;
308+
const scored = candidates.map((post) => {
309+
const postTags = post.data.tags || [];
310+
311+
// 标签相似度
312+
let tagScore: number;
313+
if (w.useIDF && currentTags.length > 0 && postTags.length > 0) {
314+
tagScore = idfWeightedTagSimilarity(currentTags, postTags, tagIDF);
315+
} else {
316+
tagScore = jaccardSimilarity(new Set(currentTags), new Set(postTags));
317+
}
244318

245-
// timeFreshnessScore (0-30): 6 个月半衰期
246-
const daysSincePublished =
247-
(now - new Date(post.data.published).getTime()) /
248-
(1000 * 60 * 60 * 24);
249-
const timeFreshnessScore =
250-
30 * Math.exp((-Math.LN2 * daysSincePublished) / 180);
319+
// 标题相似度
320+
const postTokens = tokenize(post.data.title);
321+
const titleScore = jaccardSimilarity(currentTokens, postTokens);
322+
323+
// 描述相似度
324+
const postDesc = tokenize(post.data.description || "");
325+
const descScore = jaccardSimilarity(currentDesc, postDesc);
251326

252-
// categoryBonus (0 or 10)
327+
// 分类匹配
253328
const postCategory = post.data.category || "";
254-
const categoryBonus =
255-
currentCategory && postCategory && currentCategory === postCategory
256-
? 10
257-
: 0;
329+
const catScore =
330+
currentCategory && postCategory && currentCategory === postCategory ? 1 : 0;
331+
332+
// 时间新鲜度(指数衰减,半衰期可配)
333+
const daysSincePublished =
334+
(now - new Date(post.data.published).getTime()) / (1000 * 60 * 60 * 24);
335+
const freshnessScore = Math.exp((-Math.LN2 * daysSincePublished) / halfLife);
258336

337+
// 加权总分(归一化到 [0, 1])
259338
const totalScore =
260-
tagMatchScore +
261-
titleSimilarityScore +
262-
timeFreshnessScore +
263-
categoryBonus;
264-
265-
return {
266-
post,
267-
totalScore,
268-
tagMatchScore,
269-
timeFreshnessScore,
270-
categoryBonus,
271-
};
339+
totalWeight === 0
340+
? 0
341+
: (tagScore * w.tagSimilarity +
342+
titleScore * w.titleSimilarity +
343+
descScore * w.descriptionSimilarity +
344+
catScore * w.categoryMatch +
345+
freshnessScore * w.freshness) /
346+
totalWeight;
347+
348+
return { post, totalScore, tagScore };
272349
});
273350

274351
// 按总分降序排列
275352
scored.sort((a, b) => b.totalScore - a.totalScore);
276353

277-
// 优先取有标签匹配的
278-
const withTagMatch = scored.filter((s) => s.tagMatchScore > 0);
279-
const withoutTagMatch = scored.filter((s) => s.tagMatchScore === 0);
354+
// 优先取有标签匹配的,不足时从剩余候选中补充
355+
const withTagMatch = scored.filter((s) => s.tagScore > 0);
356+
const withoutTagMatch = scored.filter((s) => s.tagScore === 0);
280357

281358
const result: PostForList[] = [];
282359

283360
for (const s of withTagMatch) {
284-
if (result.length >= maxCount) {
285-
break;
286-
}
361+
if (result.length >= maxCount) break;
287362
result.push({ id: s.post.id, data: s.post.data });
288363
}
289364

290-
// 不足时从剩余候选中按 timeFreshnessScore + categoryBonus 降序补充
291-
if (result.length < maxCount) {
292-
withoutTagMatch.sort(
293-
(a, b) =>
294-
b.timeFreshnessScore +
295-
b.categoryBonus -
296-
(a.timeFreshnessScore + a.categoryBonus),
297-
);
298-
for (const s of withoutTagMatch) {
299-
if (result.length >= maxCount) {
300-
break;
301-
}
302-
result.push({ id: s.post.id, data: s.post.data });
303-
}
365+
for (const s of withoutTagMatch) {
366+
if (result.length >= maxCount) break;
367+
result.push({ id: s.post.id, data: s.post.data });
304368
}
305369

306370
return result;

0 commit comments

Comments
 (0)