1 change: 0 additions & 1 deletion ext/js/background/backend.js
@@ -1740,7 +1740,6 @@ export class Backend {
}
result.push(termParts);
}
- result.push([{text: '\n', reading: ''}]);
}
results.push([name, result]);
}
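With this change the backend no longer appends a synthetic newline fragment after each parsed line. A minimal sketch of the resulting shape (the fragment fields and parser name below are assumptions for illustration, not actual Yomitan output):

```js
// Minimal sketch, assuming this fragment shape and parser name: each inner
// array is one parsed line; there is no {text: '\n'} separator between lines.
const results = [
    ['mecab: ipadic', [
        [{text: '今日', reading: 'きょう'}, {text: 'は', reading: 'は'}],
        [{text: '晴れ', reading: 'はれ'}],
    ]],
];
for (const [name, lines] of results) {
    console.log(name, lines.map((line) => line.map((part) => part.text).join('')));
}
```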
92 changes: 89 additions & 3 deletions ext/js/comm/mecab.js
@@ -206,15 +206,101 @@ export class Mecab {
/** @type {import('mecab').ParseResult[]} */
const results = [];
for (const [name, rawLines] of Object.entries(rawResults)) {
// Define helper functions based on dictionary type
let ignoreReading, isNoun, isProperNoun, isCopula, isAuxVerb, isContinuativeForm, isVerbSuffix, isTatteParticle, isBaParticle, isTeDeParticle, isTaDaParticle, isVerb, isVerbNonIndependent, isNounSuffix, isCounter, isNumeral;

if (name === 'unidic-mecab-translate') {
// Helper functions for unidic-mecab-translate
ignoreReading = (tok) => tok.pos1 === 'symbol' && tok.pos2 === 'character';
isNoun = (tok) => tok.pos1 === 'noun';
isCopula = (tok) => tok.inflection_type === 'aux|da' || tok.inflection_type === 'aux|desu';
isAuxVerb = (tok) => (tok.pos1 === 'aux' || tok.pos1 === 'aux-verb') && !isCopula(tok);
isContinuativeForm = (tok) => (tok.inflection_form.startsWith('continuative'));
isVerbSuffix = (tok) => tok.pos1 === 'suffix';
isTatteParticle = (tok) => tok.pos1 === 'particle' && tok.pos2 === 'conjunctive' && (tok.lemma === 'たって');
isBaParticle = (tok) => tok.pos1 === 'particle' && tok.pos2 === 'conjunctive' && (tok.term === 'ば');
isTeDeParticle = (tok) => tok.pos1 === 'particle' && tok.pos2 === 'conjunctive' && tok.lemma === 'て';
isTaDaParticle = (tok) => isAuxVerb(tok) && (tok.term === 'た' || tok.term === 'だ');
isVerb = (tok) => tok.pos1 === 'verb' || (tok.pos1 === 'aux' || tok.pos1 === 'aux-verb');
isVerbNonIndependent = (tok) => isVerb(tok) && tok.pos2 === 'nonindependent?';
isProperNoun = (tok) => tok.pos1 === 'noun' && tok.pos2 === 'proper';
isNounSuffix = (tok) => tok.pos1 === 'suffix' && tok.pos2 === 'substantive';
isCounter = (tok) => tok.pos1 === 'noun' && tok.pos2 === 'common' && tok.pos3 === 'counter?';
isNumeral = (tok) => tok.pos1 === 'noun' && tok.pos2 === 'numeral';
} else {
// Helper functions for ipadic and other dictionaries
ignoreReading = (tok) => tok.pos1 === '記号' && tok.pos2 === '文字';
isNoun = (tok) => tok.pos1 === '名詞';
const isCopulaIpadic = (tok) => tok.inflection_type === '特殊|だ' || tok.inflection_type === '特殊|デス';
const isCopulaUnidic = (tok) => tok.inflection_type === '助動詞-ダ' || tok.inflection_type === '助動詞-デス';
isCopula = (tok) => isCopulaIpadic(tok) || isCopulaUnidic(tok);
isAuxVerb = (tok) => tok.pos1 === '助動詞' && !isCopula(tok);
isContinuativeForm = (tok) => (tok.inflection_form === '連用デ接続' || tok.inflection_form === '連用タ接続' || tok.inflection_form.startsWith('連用形')) && (tok.reading !== 'ない');
// 待ってるじゃないです : てる is 動詞,非自立,*,*,一段,基本形,てる,テル,テル
// やられる : れる is 動詞,接尾,*,*,一段,基本形,れる,レル,レル
const isVerbSuffixIpadic = (tok) => tok.pos1 === '動詞' && (tok.pos2 === '非自立' || tok.pos2 === '接尾');
const isVerbSuffixUnidic = (tok) => tok.pos1 === '接尾辞' && (tok.pos2 === '形容詞的');
isVerbSuffix = (tok) => isVerbSuffixUnidic(tok) || isVerbSuffixIpadic(tok);
isTatteParticle = (tok) => tok.pos1 === '助詞' && tok.pos2 === '接続助詞' && (tok.lemma === 'たって');
isBaParticle = (tok) => tok.pos1 === '助詞' && tok.pos2 === '接続助詞' && (tok.term === 'ば');
isTeDeParticle = (tok) => tok.pos1 === '助詞' && tok.pos2 === '接続助詞' && (tok.term === 'て' || tok.term === 'で' || tok.term === 'ちゃ'); // cha doesn't have a lemma in ipadic
isTaDaParticle = (tok) => isAuxVerb(tok) && (tok.term === 'た' || tok.term === 'だ');
isVerb = (tok) => tok.pos1 === '動詞' || tok.pos1 === '助動詞';
isVerbNonIndependent = (_) => true;
isProperNoun = (tok) => tok.pos1 === '名詞' && tok.pos2 === '固有名詞';
const isNounSuffixIpadic = (tok) => tok.pos1 === '動詞' && tok.pos2 === '接尾';
const isNounSuffixUnidic = (tok) => tok.pos1 === '接尾辞' && tok.pos2 === '名詞的';
isNounSuffix = (tok) => isNounSuffixIpadic(tok) || isNounSuffixUnidic(tok);
isCounter = (tok) => tok.pos1 === '名詞' && tok.pos3.startsWith('助数詞');
isNumeral = (tok) => tok.pos1 === '名詞' && tok.pos2.startsWith('数');
}

/** @type {import('mecab').ParseFragment[][]} */
const lines = [];
let last_standalone_token;

for (const rawLine of rawLines) {
/** @type {import('mecab').ParseFragment[]} */
const line = [];
- for (let {expression: term, reading, source} of rawLine) {
+ for (let {expression: term, reading, source, pos1, pos2, pos3, pos4, inflection_type, inflection_form, lemma} of rawLine) {
if (typeof term !== 'string') { term = ''; }
if (typeof reading !== 'string') { reading = ''; }
if (typeof source !== 'string') { source = ''; }
- line.push({term, reading, source});
if (typeof pos1 !== 'string') { pos1 = ''; }
if (typeof pos2 !== 'string') { pos2 = ''; }
if (typeof pos3 !== 'string') { pos3 = ''; }
if (typeof pos4 !== 'string') { pos4 = ''; }
if (typeof inflection_type !== 'string') { inflection_type = ''; }
if (typeof inflection_form !== 'string') { inflection_form = ''; }
if (typeof lemma !== 'string') { lemma = ''; }

const token = {term, reading, source, pos1, pos2, pos3, pos4, inflection_type, inflection_form, lemma};

if (ignoreReading(token)) {
token.reading = '';
}

let result_token = token;
let should_merge;
if (line.length > 0) {
const last_result_token = line[line.length - 1];
// eslint-disable-next-line @typescript-eslint/no-unsafe-assignment
should_merge = (isVerb(last_standalone_token) && (isAuxVerb(token) || (isContinuativeForm(last_standalone_token) && isVerbSuffix(token)) || (isVerbSuffix(token) && isVerbNonIndependent(last_standalone_token)))) ||
(isNoun(last_standalone_token) && !isProperNoun(last_standalone_token) && isNounSuffix(token)) ||
(isCounter(token) && isNumeral(last_standalone_token)) ||
isBaParticle(token) || isTatteParticle(token) ||
(isTeDeParticle(token) && isContinuativeForm(last_standalone_token)) ||
isTaDaParticle(token); // Allow more than just verbs here: adjectives take た too, e.g. なかった
if (should_merge) {
line.pop();
last_result_token.term = last_result_token.term + token.term;
last_result_token.reading = last_result_token.reading + token.reading;
last_result_token.source = last_result_token.source + token.source;
result_token = last_result_token;
}
}
last_standalone_token = token;
line.push(result_token);
}
lines.push(line);
}
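To see how the predicates classify a raw token, here is a minimal sketch using the 待ってる example from the comment in the diff above (the token object is hypothetical; its field values are taken from that comment):

```js
// Hypothetical ipadic token for てる in 待ってる, with the field values
// 動詞,非自立,*,*,一段,基本形,てる,テル,テル from the comment in the diff.
const token = {
    term: 'てる', reading: 'テル', source: 'てる',
    pos1: '動詞', pos2: '非自立', pos3: '*', pos4: '*',
    inflection_type: '一段', inflection_form: '基本形', lemma: 'てる',
};
const isVerbSuffixIpadic = (tok) => tok.pos1 === '動詞' && (tok.pos2 === '非自立' || tok.pos2 === '接尾');
console.log(isVerbSuffixIpadic(token)); // true: this token is a candidate to merge into the preceding verb
```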
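The merge loop itself reduces to a fold over tokens: whenever the current token qualifies, it is concatenated onto the previous fragment instead of starting a new one. A standalone sketch with a deliberately simplified shouldMerge (the real condition is the predicate chain above):

```js
// Standalone sketch of the merging step with a simplified predicate;
// the real code combines many POS checks, this only merges verb + auxiliary.
const shouldMerge = (prev, tok) => typeof prev !== 'undefined' && prev.pos1 === '動詞' && tok.pos1 === '助動詞';
const tokens = [
    {term: '待っ', reading: 'マッ', source: '待っ', pos1: '動詞'},
    {term: 'た', reading: 'タ', source: 'た', pos1: '助動詞'},
];
const line = [];
let previous;
for (const token of tokens) {
    const last = line[line.length - 1];
    if (typeof last !== 'undefined' && shouldMerge(previous, token)) {
        // Fold the auxiliary into the previous fragment.
        last.term += token.term;
        last.reading += token.reading;
        last.source += token.source;
    } else {
        line.push({...token});
    }
    previous = token;
}
console.log(line); // [{term: '待った', reading: 'マッタ', source: '待った', pos1: '動詞'}]
```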
6 changes: 3 additions & 3 deletions ext/js/comm/yomitan-api.js
@@ -239,7 +239,7 @@ export class YomitanApi {
case 'tokenize': {
/** @type {import('yomitan-api.js').tokenizeInput} */
// @ts-expect-error - Allow this to error
- const {text, scanLength} = parsedBody;
+ const {text, scanLength, parser} = parsedBody;
if (typeof text !== 'string') {
throw new Error('Invalid input for tokenize, expected "text" to be a string but got ' + typeof text);
}
@@ -250,8 +250,8 @@
text: text,
optionsContext: {index: optionsFull.profileCurrent},
scanLength: scanLength,
- useInternalParser: true,
- useMecabParser: false,
+ useInternalParser: parser !== 'mecab',
+ useMecabParser: parser === 'mecab',
};
result = await this._invoke('parseText', invokeParams);
break;
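For consumers of the API, the tokenize endpoint now selects the parser from the request body. A hypothetical call (the host and port are placeholders, not defined in this diff; only the text, scanLength, and parser fields come from the code above):

```js
// Hypothetical request; host/port are placeholders, not part of this diff.
const response = await fetch('http://127.0.0.1:19633/tokenize', {
    method: 'POST',
    headers: {'Content-Type': 'application/json'},
    body: JSON.stringify({text: '待ってるじゃないです', scanLength: 10, parser: 'mecab'}),
});
console.log(await response.json());
// Omitting parser (or any value other than 'mecab') uses the internal parser.
```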
11 changes: 11 additions & 0 deletions types/ext/mecab.d.ts
@@ -25,6 +25,13 @@ export type ParseResultTermRaw = {
expression?: string;
reading?: string;
source?: string;
pos1?: string;
pos2?: string;
pos3?: string;
pos4?: string;
inflection_type?: string;
inflection_form?: string;
lemma?: string;
};

/** The resulting data from an invocation of `parseText`. */
@@ -43,4 +50,8 @@ export type ParseFragment = {
reading: string;
/** The source text. */
source: string;
/** The part of speech (major category). */
pos1: string;
/** The part of speech (minor category). */
pos2: string;
};
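As a usage sketch, a raw MeCab record carrying the new optional fields might look like this (the object is illustrative; its values follow the れる example from the mecab.js comment):

```js
// Illustrative record for れる (動詞,接尾,*,*,一段,基本形,れる,レル,レル),
// per the comment in mecab.js.
/** @type {import('mecab').ParseResultTermRaw} */
const raw = {
    expression: 'れる',
    reading: 'レル',
    source: 'れる',
    pos1: '動詞',
    pos2: '接尾',
    pos3: '*',
    pos4: '*',
    inflection_type: '一段',
    inflection_form: '基本形',
    lemma: 'れる',
};
```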