Skip to content

Commit bc7dab5

Browse files
许君山
authored and committed
feat: v1.2.0 - SQLite local dictionary, dual-track suggest, TTS, furigana dedup
- Add SQLite (better-sqlite3) local dictionary with 2181 JLPT N5-N3 words
- Implement dual-track suggest: instant local results + async remote Jisho fallback
- Auto-cache Jisho results to SQLite for subsequent instant lookups
- Add local-first lookup in /api/conjugate for non-verb words (17s → 45ms)
- Add browser SpeechSynthesis TTS with speak buttons across UI
- Fix furigana duplication when AI embeds inline readings (夜よる → 夜(よる))
- Add JLPT batch fetch script (fetch-words.js) for dictionary expansion
- Update .gitignore for SQLite WAL files
1 parent 1086bb3 commit bc7dab5

10 files changed

Lines changed: 1800 additions & 216 deletions

File tree

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -104,5 +104,7 @@ temp/
 
 # Database
 *.db
+*.db-shm
+*.db-wal
 *.sqlite
 *.sqlite3

backend/common-words.json

Lines changed: 370 additions & 0 deletions
Large diffs are not rendered by default.

backend/db.js

Lines changed: 98 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,98 @@
1+
import Database from 'better-sqlite3';
2+
import path from 'path';
3+
import { fileURLToPath } from 'url';
4+
5+
// ES modules do not define __filename/__dirname, so derive them from this module's URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// The dictionary database file is stored next to this module.
const dbPath = path.join(__dirname, 'dictionary.db');
const db = new Database(dbPath);

// Performance tuning: WAL journal mode with the NORMAL synchronous setting.
db.pragma('journal_mode = WAL');
db.pragma('synchronous = NORMAL');

// Create the words table if it does not exist yet.
// A (kanji, kana) pair must be unique across the table.
db.exec(`
  CREATE TABLE IF NOT EXISTS words (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    kanji TEXT NOT NULL,
    kana TEXT NOT NULL,
    romaji TEXT NOT NULL,
    meaning TEXT NOT NULL DEFAULT '',
    word_type TEXT NOT NULL DEFAULT 'other',
    jlpt TEXT DEFAULT '',
    is_common INTEGER DEFAULT 0,
    UNIQUE(kanji, kana)
  )
`);

// Create one index per column that the lookup queries filter on.
db.exec(`
  CREATE INDEX IF NOT EXISTS idx_words_kanji ON words(kanji);
  CREATE INDEX IF NOT EXISTS idx_words_kana ON words(kana);
  CREATE INDEX IF NOT EXISTS idx_words_romaji ON words(romaji);
  CREATE INDEX IF NOT EXISTS idx_words_meaning ON words(meaning);
  CREATE INDEX IF NOT EXISTS idx_words_type ON words(word_type);
`);
38+
39+
// Search statement: substring (LIKE) match across kanji, kana, romaji and meaning.
// Every LIKE declares "\" as its escape character so that "%", "_" and "\" coming
// from the caller can be matched literally instead of acting as wildcards.
const searchStmt = db.prepare(`
  SELECT kanji, kana, romaji, meaning, word_type AS wordType, jlpt, is_common AS isCommon
  FROM words
  WHERE kanji LIKE ? ESCAPE '\\'
     OR kana LIKE ? ESCAPE '\\'
     OR romaji LIKE ? ESCAPE '\\'
     OR meaning LIKE ? ESCAPE '\\'
  LIMIT ?
`);

/**
 * Find words whose kanji, kana, romaji or meaning contains `query`.
 *
 * @param {string} query - Text to search for; LIKE wildcards in it are treated literally.
 * @param {number} [limit=8] - Maximum number of rows to return.
 * @returns {Array<object>} Rows with kanji, kana, romaji, meaning, wordType, jlpt and isCommon.
 */
export function searchWords(query, limit = 8) {
  // Prefix each LIKE metacharacter with the escape character declared in the SQL above.
  const literal = String(query).replace(/[\\%_]/g, '\\$&');
  const pattern = `%${literal}%`;
  return searchStmt.all(pattern, pattern, pattern, pattern, limit);
}
51+
52+
// Exact lookup (used by the conjugate endpoint to fetch meaning/reading).
const findExactStmt = db.prepare(`
  SELECT kanji, kana, romaji, meaning, word_type AS wordType, jlpt, is_common AS isCommon
  FROM words
  WHERE kanji = ? OR kana = ?
  LIMIT 1
`);

/**
 * Look up a single word whose kanji or kana equals `keyword` exactly.
 *
 * @param {string} keyword - Kanji or kana form to match.
 * @returns {object|null} The first matching row, or null when nothing matches.
 */
export function findWord(keyword) {
  const row = findExactStmt.get(keyword, keyword);
  return row || null;
}
63+
64+
// Insert statement; rows that violate a constraint (e.g. a duplicate kanji/kana pair)
// are skipped because of OR IGNORE.
const insertStmt = db.prepare(`
  INSERT OR IGNORE INTO words (kanji, kana, romaji, meaning, word_type, jlpt, is_common)
  VALUES (?, ?, ?, ?, ?, ?, ?)
`);

/**
 * Insert one word into the dictionary.
 *
 * Both camelCase (`wordType`, `isCommon`) and snake_case (`word_type`, `is_common`)
 * field names are accepted.
 *
 * @param {object} word - Word record with kanji, kana, romaji and optional
 *   meaning, word type, jlpt and common flag.
 * @returns {object} Whatever `insertStmt.run` returns for this insert.
 */
export function insertWord(word) {
  const wordType = word.wordType || word.word_type || 'other';
  const commonFlag = (word.isCommon || word.is_common) ? 1 : 0;

  return insertStmt.run(
    word.kanji,
    word.kana,
    word.romaji,
    word.meaning || '',
    wordType,
    word.jlpt || '',
    commonFlag
  );
}
81+
82+
// Bulk insert: all rows of one call are written inside a single transaction.
const insertMany = db.transaction((words) => {
  for (const entry of words) {
    insertWord(entry);
  }
});

/**
 * Insert many words in one transaction.
 *
 * @param {Iterable<object>} words - Word records accepted by `insertWord`.
 * @returns {void}
 */
export function bulkInsert(words) {
  insertMany(words);
}
92+
93+
// Prepared once and reused, instead of re-preparing the statement on every call.
const countStmt = db.prepare('SELECT COUNT(*) AS count FROM words');

/**
 * Get the number of entries in the words table.
 *
 * @returns {number} Row count of `words`.
 */
export function getWordCount() {
  return countStmt.get().count;
}
97+
98+
export default db;

backend/fetch-words.js

Lines changed: 171 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,171 @@
1+
/**
2+
* 从 Jisho API 批量抓取 JLPT 词汇并导入 SQLite
3+
* 用法: node fetch-words.js [--levels n5,n4,n3,n2,n1] [--delay 1500]
4+
* 默认抓取 N5, N4, N3
5+
*/
6+
import https from 'https';
7+
import * as wanakana from 'wanakana';
8+
import { bulkInsert, getWordCount } from './db.js';
9+
10+
const args = process.argv.slice(2);

const DEFAULT_LEVELS = ['n5', 'n4', 'n3'];
const DEFAULT_DELAY_MS = 1500;

/**
 * Read the value of a command-line option.
 * Accepts both `--name=value` and `--name value`.
 *
 * @param {string[]} argv - Command-line arguments (without node/script path).
 * @param {string} name - Option name without the leading dashes.
 * @returns {string|undefined} The raw option value, or undefined when absent.
 */
function readOption(argv, name) {
  const flag = `--${name}`;
  const prefix = `${flag}=`;
  for (let i = 0; i < argv.length; i += 1) {
    const arg = argv[i];
    if (arg.startsWith(prefix)) {
      return arg.slice(prefix.length);
    }
    // Space-separated form: the next token is the value unless it is another option.
    if (arg === flag && i + 1 < argv.length && !argv[i + 1].startsWith('--')) {
      return argv[i + 1];
    }
  }
  return undefined;
}

/**
 * Turn a comma-separated level list into an array of lowercase level names.
 * Falls back to the default levels when the option is missing or empty.
 *
 * @param {string|undefined} raw - Raw `--levels` value.
 * @returns {string[]} Level names such as ['n5', 'n4'].
 */
function parseLevels(raw) {
  if (raw === undefined) {
    return [...DEFAULT_LEVELS];
  }
  const parsed = raw
    .split(',')
    .map((level) => level.trim().toLowerCase())
    .filter(Boolean);
  return parsed.length > 0 ? parsed : [...DEFAULT_LEVELS];
}

/**
 * Parse the request delay in milliseconds.
 * Falls back to the default when the value is missing, non-numeric or negative,
 * so a typo cannot remove the pause between requests.
 *
 * @param {string|undefined} raw - Raw `--delay` value.
 * @returns {number} Delay in milliseconds.
 */
function parseDelay(raw) {
  if (raw === undefined) {
    return DEFAULT_DELAY_MS;
  }
  const ms = Number.parseInt(raw, 10);
  return Number.isFinite(ms) && ms >= 0 ? ms : DEFAULT_DELAY_MS;
}

const levelArg = readOption(args, 'levels');
const delayArg = readOption(args, 'delay');

const levels = parseLevels(levelArg);
const DELAY_MS = parseDelay(delayArg);
// Safety cap: at most 50 pages × 20 entries = 1000 words per level.
const MAX_PAGES_PER_LEVEL = 50;
19+
20+
/**
 * Pause for a given time.
 *
 * @param {number} ms - Delay in milliseconds, passed to setTimeout.
 * @returns {Promise<void>} Promise that resolves once the timer fires.
 */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
23+
24+
/**
 * Fetch one page of Jisho word-search results.
 *
 * @param {string} keyword - Search keyword; it is URL-encoded before being sent.
 * @param {number} page - Page number to request.
 * @returns {Promise<object>} Resolves with the parsed JSON response body.
 *   Rejects on a network error, a timeout, a non-2xx status or invalid JSON.
 */
function fetchPage(keyword, page) {
  return new Promise((resolve, reject) => {
    const url = `https://jisho.org/api/v1/search/words?keyword=${encodeURIComponent(keyword)}&page=${page}`;

    const req = https.get(url, { timeout: 15000 }, (res) => {
      if (res.statusCode < 200 || res.statusCode >= 300) {
        // Drain the body so the socket is released, then report the status
        // instead of failing later with a misleading JSON parse error.
        res.resume();
        reject(new Error(`HTTP ${res.statusCode}`));
        return;
      }

      // Decode as a UTF-8 stream: concatenating raw Buffer chunks would corrupt
      // multi-byte characters that are split across chunk boundaries.
      res.setEncoding('utf8');

      let data = '';
      res.on('data', (chunk) => {
        data += chunk;
      });
      res.on('error', reject);
      res.on('end', () => {
        try {
          resolve(JSON.parse(data));
        } catch (e) {
          reject(new Error(`JSON parse error: ${e.message}`));
        }
      });
    });

    req.on('error', reject);
    // The timeout event does not abort the request by itself; destroy it explicitly.
    req.on('timeout', () => {
      req.destroy();
      reject(new Error('Request timeout'));
    });
  });
}
41+
42+
// Ordered part-of-speech rules; the first rule that matches wins.
// "verb" and "noun" are matched as whole words so that "adverb" is not taken
// for a verb and "pronoun" is not taken for a noun.
const POS_RULES = [
  [/\bverb\b/, 'verb'],
  [/i-adjective/, 'i-adjective'],
  [/na-adjective/, 'na-adjective'],
  [/\bnoun\b/, 'noun'],
  [/adverb/, 'adverb'],
  [/particle/, 'particle'],
  [/conjunction/, 'conjunction'],
  [/interjection/, 'interjection'],
  [/prefix/, 'prefix'],
  [/suffix/, 'suffix'],
  [/counter/, 'counter'],
  [/pronoun/, 'pronoun'],
  [/expression/, 'expression'],
];

/**
 * Convert one Jisho API result item into a dictionary word record.
 *
 * @param {object} item - Result item with `japanese`, `senses`, `jlpt` and `is_common`.
 * @returns {object|null} Record with kanji, kana, romaji, meaning, wordType, jlpt and
 *   isCommon, or null when the item has no usable reading or carries neither a
 *   recognised part of speech nor any English definition.
 */
function parseJishoItem(item) {
  if (!item.japanese || item.japanese.length === 0) return null;

  // Only the first written form / reading pair is used.
  const japanese = item.japanese[0];
  const kanji = japanese.word || japanese.reading;
  const kana = japanese.reading || kanji;
  if (!kanji || !kana) return null;

  let wordType = 'other';
  const meanings = [];

  for (const sense of (item.senses || [])) {
    // The word type comes from the first sense with a recognised part of speech.
    if (wordType === 'other') {
      const pos = (sense.parts_of_speech || []).join(' ').toLowerCase();
      const rule = POS_RULES.find(([matcher]) => matcher.test(pos));
      if (rule) wordType = rule[1];
    }

    // Keep at most three definitions per sense.
    const defs = (sense.english_definitions || []).slice(0, 3).join(', ');
    if (defs) meanings.push(defs);
  }

  if (wordType === 'other' && meanings.length === 0) return null;

  const jlptArr = item.jlpt || [];
  const jlpt = jlptArr.length > 0 ? jlptArr[0].replace('jlpt-', '').toUpperCase() : '';

  return {
    kanji,
    kana,
    romaji: wanakana.toRomaji(kana),
    // Keep at most two senses in the stored meaning.
    meaning: meanings.slice(0, 2).join('; '),
    // Entries whose part of speech could not be recognised default to noun.
    wordType: wordType === 'other' ? 'noun' : wordType,
    jlpt,
    isCommon: item.is_common ? 1 : 0
  };
}
89+
90+
/**
 * Fetch all words of one JLPT level, page by page.
 *
 * Paging stops at the first empty page, after MAX_PAGES_PER_LEVEL pages, or when
 * a page still fails after three attempts. In the failure case the words collected
 * so far are returned and the failure is reported as such.
 *
 * @param {string} level - JLPT level name such as 'n5'.
 * @returns {Promise<object[]>} Parsed word records for the level.
 */
async function fetchLevel(level) {
  const MAX_ATTEMPTS = 3;
  const RETRY_DELAY_MS = 3000;

  const keyword = `#jlpt-${level}`;
  const words = [];

  for (let page = 1; page <= MAX_PAGES_PER_LEVEL; page += 1) {
    process.stdout.write(` 📡 JLPT ${level.toUpperCase()} - 第 ${page} 页... `);

    let result;
    let fetched = false;
    for (let attempt = 1; attempt <= MAX_ATTEMPTS; attempt += 1) {
      try {
        result = await fetchPage(keyword, page);
        fetched = true;
        break;
      } catch (e) {
        console.log(`⚠️ 重试 ${attempt}/3: ${e.message}`);
        // No point in waiting after the final attempt.
        if (attempt < MAX_ATTEMPTS) {
          await sleep(RETRY_DELAY_MS);
        }
      }
    }

    if (!fetched) {
      // Report the failure explicitly instead of presenting it as "no more data".
      console.log(`❌ 第 ${page} 页连续 ${MAX_ATTEMPTS} 次请求失败,停止抓取 ${level.toUpperCase()}`);
      break;
    }

    if (!result || !result.data || result.data.length === 0) {
      console.log('✅ 无更多数据');
      break;
    }

    let pageCount = 0;
    for (const item of result.data) {
      const word = parseJishoItem(item);
      if (word) {
        words.push(word);
        pageCount += 1;
      }
    }

    console.log(`${pageCount} 条 (累计 ${words.length})`);

    // Pause between pages to avoid hammering the API.
    await sleep(DELAY_MS);
  }

  return words;
}
135+
136+
/**
 * Script entry point: fetch every requested JLPT level, import the collected
 * words into the database in batches, and print a before/after summary.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const beforeCount = getWordCount();
  console.log(`📊 当前数据库: ${beforeCount} 条词汇\n`);
  console.log(`🎯 准备抓取 JLPT 等级: ${levels.map(l => l.toUpperCase()).join(', ')}`);
  console.log(`⏱️ 请求间隔: ${DELAY_MS}ms\n`);

  // Levels are fetched one after another.
  const allWords = [];
  for (const level of levels) {
    const label = level.toUpperCase();
    console.log(`\n── JLPT ${label} ──`);
    const levelWords = await fetchLevel(level);
    for (const word of levelWords) {
      allWords.push(word);
    }
    console.log(` 📦 ${label} 共获取 ${levelWords.length} 条\n`);
  }

  const total = allWords.length;
  console.log(`\n📥 正在批量导入 ${total} 条词汇...`);

  // Import in batches of 200 words.
  const BATCH_SIZE = 200;
  for (let start = 0; start < total; start += BATCH_SIZE) {
    const end = Math.min(start + BATCH_SIZE, total);
    bulkInsert(allWords.slice(start, end));
    process.stdout.write(` 已导入 ${end}/${total}\r`);
  }

  const afterCount = getWordCount();
  console.log(`\n\n✅ 导入完成!`);
  console.log(` 导入前: ${beforeCount} 条`);
  console.log(` 导入后: ${afterCount} 条`);
  console.log(` 新增: ${afterCount - beforeCount} 条(重复词汇已自动跳过)`);
}

// Run the script; any failure is logged and the process exits with code 1.
main().catch((err) => {
  console.error('❌ 抓取失败:', err);
  process.exit(1);
});

backend/import-words.js

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,19 @@
1+
import fs from 'fs';
2+
import path from 'path';
3+
import { fileURLToPath } from 'url';
4+
import { bulkInsert, getWordCount } from './db.js';
5+
6+
// ES modules do not define __filename/__dirname, so derive them from this module's URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// Read common-words.json, which sits next to this script.
const wordsPath = path.join(__dirname, 'common-words.json');
const words = JSON.parse(fs.readFileSync(wordsPath, 'utf8'));

// Fail with a clear message if the file does not hold a list of words.
if (!Array.isArray(words)) {
  throw new TypeError(`Expected an array of words in ${wordsPath}`);
}

console.log(`📦 读取到 ${words.length} 条词汇,正在导入...`);

// Insert everything in one transaction.
bulkInsert(words);

const count = getWordCount();
console.log(`✅ 导入完成!数据库中共 ${count} 条词汇。`);

0 commit comments

Comments
 (0)