|
| 1 | +/** |
| 2 | + * 从 Jisho API 批量抓取 JLPT 词汇并导入 SQLite |
| 3 | + * 用法: node fetch-words.js [--levels=n5,n4,n3,n2,n1] [--delay=1500] |
| 4 | + * 默认抓取 N5, N4, N3 |
| 5 | + */ |
| 6 | +import https from 'https'; |
| 7 | +import * as wanakana from 'wanakana'; |
| 8 | +import { bulkInsert, getWordCount } from './db.js'; |
| 9 | + |
// Levels fetched when --levels is absent or contains no usable entry.
const DEFAULT_LEVELS = ['n5', 'n4', 'n3'];
// Pause between page requests when --delay is absent or invalid.
const DEFAULT_DELAY_MS = 1500;

/**
 * Read a command-line option given either as `--name=value` or as
 * `--name value` (the form shown in the usage text of this script).
 *
 * @param {string[]} argv - Arguments to search (without node/script path).
 * @param {string} name - Option name without the leading dashes.
 * @returns {string|undefined} Raw option value, or undefined when absent.
 */
function readOption(argv, name) {
  const flag = `--${name}`;
  const inlinePrefix = `${flag}=`;
  for (let i = 0; i < argv.length; i += 1) {
    const arg = argv[i];
    if (arg.startsWith(inlinePrefix)) {
      return arg.slice(inlinePrefix.length);
    }
    if (arg === flag) {
      const next = argv[i + 1];
      // A following `--other` flag is not a value for this option.
      return next !== undefined && !next.startsWith('--') ? next : undefined;
    }
  }
  return undefined;
}

/**
 * Turn a comma-separated level list into normalized level names.
 *
 * @param {string|undefined} raw - e.g. "n5, N4"; undefined when not supplied.
 * @returns {string[]} Lower-cased, trimmed, non-empty level names; the
 *   default levels when nothing usable was supplied.
 */
function parseLevels(raw) {
  const parsed = (raw === undefined ? '' : raw)
    .split(',')
    .map((entry) => entry.trim().toLowerCase())
    .filter((entry) => entry.length > 0);
  return parsed.length > 0 ? parsed : [...DEFAULT_LEVELS];
}

/**
 * Parse the request delay in milliseconds.
 *
 * @param {string|undefined} raw - Raw option value.
 * @returns {number} A non-negative integer; the default delay when the value
 *   is missing, non-numeric or negative (a NaN delay would make setTimeout
 *   fire almost immediately and defeat the throttling).
 */
function parseDelay(raw) {
  const parsed = Number.parseInt(raw === undefined ? '' : raw, 10);
  return Number.isFinite(parsed) && parsed >= 0 ? parsed : DEFAULT_DELAY_MS;
}

const args = process.argv.slice(2);

const levels = parseLevels(readOption(args, 'levels'));
const DELAY_MS = parseDelay(readOption(args, 'delay'));
const MAX_PAGES_PER_LEVEL = 50; // Safety cap: at most 50 pages x 20 entries = 1000 words per level
| 19 | + |
/**
 * Pause for the given number of milliseconds.
 *
 * @param {number} ms - How long to wait.
 * @returns {Promise<void>} Settles (with no value) once the timer fires.
 */
function sleep(ms) {
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
| 23 | + |
/**
 * Fetch one page of Jisho word-search results.
 *
 * @param {string} keyword - Search keyword (e.g. "#jlpt-n5"); URL-encoded here.
 * @param {number} page - 1-based page number.
 * @returns {Promise<object>} The parsed JSON response body.
 * @throws {Error} (as a rejection) on network errors, a 15 s socket timeout,
 *   a non-200 HTTP status, or a body that is not valid JSON.
 */
function fetchPage(keyword, page) {
  return new Promise((resolve, reject) => {
    const url = `https://jisho.org/api/v1/search/words?keyword=${encodeURIComponent(keyword)}&page=${page}`;
    const req = https.get(url, { timeout: 15000 }, (res) => {
      if (res.statusCode !== 200) {
        // Drain the body so the socket is released, then report the real
        // cause instead of a confusing JSON parse error on an error page.
        res.resume();
        reject(new Error(`HTTP ${res.statusCode} from Jisho API`));
        return;
      }

      // Decode as a stream: without this, a multi-byte character split
      // across two chunks is corrupted when each Buffer is stringified alone.
      res.setEncoding('utf8');
      let body = '';
      res.on('data', (chunk) => {
        body += chunk;
      });
      res.on('error', reject);
      res.on('end', () => {
        try {
          resolve(JSON.parse(body));
        } catch (e) {
          reject(new Error(`JSON parse error: ${e.message}`));
        }
      });
    });

    req.on('error', reject);
    // The 'timeout' event only signals idleness; the request must be
    // destroyed explicitly. Destroying with an error surfaces it via 'error'.
    req.on('timeout', () => {
      req.destroy(new Error('Request timeout'));
    });
  });
}
| 41 | + |
// Word types in priority order. Each pattern is anchored at a word start so
// that "adverb" is not mistaken for "verb" and "pronoun" is not mistaken for
// "noun"; the end is left open so plural tags ("Expressions ...") still match.
const WORD_TYPE_PATTERNS = [
  ['verb', /\bverb/],
  ['i-adjective', /\bi-adjective/],
  ['na-adjective', /\bna-adjective/],
  ['noun', /\bnoun/],
  ['adverb', /\badverb/],
  ['particle', /\bparticle/],
  ['conjunction', /\bconjunction/],
  ['interjection', /\binterjection/],
  ['prefix', /\bprefix/],
  ['suffix', /\bsuffix/],
  ['counter', /\bcounter/],
  ['pronoun', /\bpronoun/],
  ['expression', /\bexpression/],
];

/**
 * Map a lower-cased part-of-speech description to a word type.
 *
 * @param {string} pos - Lower-cased part-of-speech tags joined by spaces.
 * @returns {string} The first matching word type, or 'other' when none match.
 */
function classifyPartOfSpeech(pos) {
  const hit = WORD_TYPE_PATTERNS.find(([, pattern]) => pattern.test(pos));
  return hit ? hit[0] : 'other';
}

/**
 * Convert one entry of a Jisho search response into a word record.
 *
 * @param {object} item - Entry with `japanese`, `senses`, `jlpt`, `is_common`.
 * @returns {{kanji: string, kana: string, romaji: string, meaning: string,
 *   wordType: string, jlpt: string, isCommon: number}|null} The record, or
 *   null when the entry has no written form/reading, or has neither a
 *   recognizable part of speech nor any English definition.
 */
function parseJishoItem(item) {
  if (!item.japanese || item.japanese.length === 0) return null;

  // Only the first written form is used; kana-only words reuse the reading.
  const japanese = item.japanese[0];
  const kanji = japanese.word || japanese.reading;
  const kana = japanese.reading || kanji;
  if (!kanji || !kana) return null;

  let wordType = 'other';
  const meanings = [];

  for (const sense of (item.senses || [])) {
    // The first sense with a recognizable part of speech decides the type.
    if (wordType === 'other') {
      const pos = (sense.parts_of_speech || []).join(' ').toLowerCase();
      wordType = classifyPartOfSpeech(pos);
    }
    // Keep at most three definitions per sense.
    const defs = (sense.english_definitions || []).slice(0, 3).join(', ');
    if (defs) meanings.push(defs);
  }

  if (wordType === 'other' && meanings.length === 0) return null;

  const jlptArr = item.jlpt || [];
  const jlpt = jlptArr.length > 0 ? jlptArr[0].replace('jlpt-', '').toUpperCase() : '';

  return {
    kanji,
    kana,
    romaji: wanakana.toRomaji(kana),
    // Keep at most the first two senses.
    meaning: meanings.slice(0, 2).join('; '),
    // Entries with definitions but no recognizable part of speech default to noun.
    wordType: wordType === 'other' ? 'noun' : wordType,
    jlpt,
    isCommon: item.is_common ? 1 : 0,
  };
}
| 89 | + |
/**
 * Fetch every result page for one JLPT level and parse it into word records.
 *
 * Pages are requested sequentially with a pause of DELAY_MS between them,
 * up to MAX_PAGES_PER_LEVEL pages. Each page is attempted up to three times.
 * This is best-effort: if a page still fails after all attempts, fetching for
 * this level stops and the words collected so far are returned.
 *
 * @param {string} level - JLPT level name such as "n5".
 * @returns {Promise<object[]>} Word records produced by parseJishoItem.
 */
async function fetchLevel(level) {
  const MAX_ATTEMPTS = 3;
  const RETRY_DELAY_MS = 3000;
  const keyword = `#jlpt-${level}`;
  const words = [];
  let page = 1;

  while (page <= MAX_PAGES_PER_LEVEL) {
    process.stdout.write(` 📡 JLPT ${level.toUpperCase()} - 第 ${page} 页... `);

    let result;
    let fetched = false;
    for (let attempt = 1; attempt <= MAX_ATTEMPTS && !fetched; attempt += 1) {
      try {
        result = await fetchPage(keyword, page);
        fetched = true;
      } catch (e) {
        console.log(`⚠️ 重试 ${attempt}/${MAX_ATTEMPTS}: ${e.message}`);
        // No point waiting after the last attempt.
        if (attempt < MAX_ATTEMPTS) {
          await sleep(RETRY_DELAY_MS);
        }
      }
    }

    if (!fetched) {
      // Report the failure explicitly rather than as "no more data", so a
      // truncated level is not mistaken for a complete one.
      console.log(`❌ 连续 ${MAX_ATTEMPTS} 次请求失败,已停止抓取 ${level.toUpperCase()}(数据可能不完整)`);
      break;
    }

    if (!result || !Array.isArray(result.data) || result.data.length === 0) {
      console.log('✅ 无更多数据');
      break;
    }

    let pageCount = 0;
    for (const item of result.data) {
      const word = parseJishoItem(item);
      if (word) {
        words.push(word);
        pageCount += 1;
      }
    }

    console.log(`${pageCount} 条 (累计 ${words.length})`);

    page += 1;
    await sleep(DELAY_MS);
  }

  // The loop only runs past the cap when no break happened, i.e. the last
  // allowed page still returned data.
  if (page > MAX_PAGES_PER_LEVEL) {
    console.log(` ⚠️ 已达到每级 ${MAX_PAGES_PER_LEVEL} 页的安全上限,${level.toUpperCase()} 可能还有未抓取的词汇`);
  }

  return words;
}
| 135 | + |
/**
 * Script entry point: fetch the configured JLPT levels one after another,
 * insert the collected words into the database in batches, and print a
 * before/after summary of the word count.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const countBefore = getWordCount();
  console.log(`📊 当前数据库: ${countBefore} 条词汇\n`);
  const levelLabels = levels.map((name) => name.toUpperCase());
  console.log(`🎯 准备抓取 JLPT 等级: ${levelLabels.join(', ')}`);
  console.log(`⏱️ 请求间隔: ${DELAY_MS}ms\n`);

  // Words from every level, in the order the levels were requested.
  const collected = [];
  for (const level of levels) {
    const label = level.toUpperCase();
    console.log(`\n── JLPT ${label} ──`);
    const levelWords = await fetchLevel(level);
    for (const entry of levelWords) {
      collected.push(entry);
    }
    console.log(` 📦 ${label} 共获取 ${levelWords.length} 条\n`);
  }

  const total = collected.length;
  console.log(`\n📥 正在批量导入 ${total} 条词汇...`);

  // Insert in batches of 200 and rewrite the progress line in place.
  const BATCH_SIZE = 200;
  let start = 0;
  while (start < total) {
    const end = Math.min(start + BATCH_SIZE, total);
    bulkInsert(collected.slice(start, end));
    process.stdout.write(` 已导入 ${end}/${total}\r`);
    start += BATCH_SIZE;
  }

  const countAfter = getWordCount();
  const added = countAfter - countBefore;
  console.log(`\n\n✅ 导入完成!`);
  console.log(` 导入前: ${countBefore} 条`);
  console.log(` 导入后: ${countAfter} 条`);
  console.log(` 新增: ${added} 条(重复词汇已自动跳过)`);
}
| 167 | + |
/**
 * Last-resort handler: log whatever escaped main() and exit with status 1.
 *
 * @param {unknown} failure - The rejection reason from main().
 */
function reportFatal(failure) {
  console.error('❌ 抓取失败:', failure);
  process.exit(1);
}

// Start the script.
main().catch(reportFatal);
0 commit comments