1 change: 0 additions & 1 deletion ext/js/background/backend.js
@@ -1740,7 +1740,6 @@ export class Backend {
}
result.push(termParts);
}
- result.push([{text: '\n', reading: ''}]);
}
results.push([name, result]);
}
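With this change the backend no longer appends a synthetic newline fragment after each parsed line. A minimal sketch of the resulting shape (the fragment fields and parser name below are assumptions for illustration, not actual Yomitan output):

```js
// Minimal sketch, assuming this fragment shape and parser name: each inner
// array is one parsed line; there is no {text: '\n'} separator between lines.
const results = [
    ['mecab: ipadic', [
        [{text: '今日', reading: 'きょう'}, {text: 'は', reading: 'は'}],
        [{text: '晴れ', reading: 'はれ'}],
    ]],
];
for (const [name, lines] of results) {
    console.log(name, lines.map((line) => line.map((part) => part.text).join('')));
}
```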
92 changes: 89 additions & 3 deletions ext/js/comm/mecab.js
@@ -206,15 +206,101 @@ export class Mecab {
/** @type {import('mecab').ParseResult[]} */
const results = [];
for (const [name, rawLines] of Object.entries(rawResults)) {
// Define helper functions based on dictionary type
let ignoreReading, isNoun, isProperNoun, isCopula, isAuxVerb, isContinuativeForm, isVerbSuffix, isTatteParticle, isBaParticle, isTeDeParticle, isTaDaParticle, isVerb, isVerbNonIndependent, isNounSuffix, isCounter, isNumeral;

if (name === 'unidic-mecab-translate') {
// Helper functions for unidic-mecab-translate
ignoreReading = (tok) => tok.pos1 === 'symbol' && tok.pos2 === 'character';
isNoun = (tok) => tok.pos1 === 'noun';
isCopula = (tok) => tok.inflection_type === 'aux|da' || tok.inflection_type === 'aux|desu';
isAuxVerb = (tok) => (tok.pos1 === 'aux' || tok.pos1 === 'aux-verb') && !isCopula(tok);
isContinuativeForm = (tok) => (tok.inflection_form.startsWith('continuative'));
isVerbSuffix = (tok) => tok.pos1 === 'suffix';
isTatteParticle = (tok) => tok.pos1 === 'particle' && tok.pos2 === 'conjunctive' && (tok.lemma === 'たって');
isBaParticle = (tok) => tok.pos1 === 'particle' && tok.pos2 === 'conjunctive' && (tok.term === 'ば');
isTeDeParticle = (tok) => tok.pos1 === 'particle' && tok.pos2 === 'conjunctive' && tok.lemma === 'て';
isTaDaParticle = (tok) => isAuxVerb(tok) && (tok.term === 'た' || tok.term === 'だ');
isVerb = (tok) => tok.pos1 === 'verb' || (tok.pos1 === 'aux' || tok.pos1 === 'aux-verb');
isVerbNonIndependent = (tok) => isVerb(tok) && tok.pos2 === 'nonindependent?';
isProperNoun = (tok) => tok.pos1 === 'noun' && tok.pos2 === 'proper';
isNounSuffix = (tok) => tok.pos1 === 'suffix' && tok.pos2 === 'substantive';
isCounter = (tok) => tok.pos1 === 'noun' && tok.pos2 === 'common' && tok.pos3 === 'counter?';
isNumeral = (tok) => tok.pos1 === 'noun' && tok.pos2 === 'numeral';
} else {
// Helper functions for ipadic and other dictionaries
ignoreReading = (tok) => tok.pos1 === '記号' && tok.pos2 === '文字';
isNoun = (tok) => tok.pos1 === '名詞';
const isCopulaIpadic = (tok) => tok.inflection_type === '特殊|だ' || tok.inflection_type === '特殊|デス';
const isCopulaUnidic = (tok) => tok.inflection_type === '助動詞-ダ' || tok.inflection_type === '助動詞-デス';
isCopula = (tok) => isCopulaIpadic(tok) || isCopulaUnidic(tok);
isAuxVerb = (tok) => tok.pos1 === '助動詞' && !isCopula(tok);
isContinuativeForm = (tok) => (tok.inflection_form === '連用デ接続' || tok.inflection_form === '連用タ接続' || tok.inflection_form.startsWith('連用形')) && (tok.reading !== 'ない');
// 待ってるじゃないです : てる is 動詞,非自立,*,*,一段,基本形,てる,テル,テル
// やられる : れる is 動詞,接尾,*,*,一段,基本形,れる,レル,レル
const isVerbSuffixIpadic = (tok) => tok.pos1 === '動詞' && (tok.pos2 === '非自立' || tok.pos2 === '接尾');
const isVerbSuffixUnidic = (tok) => tok.pos1 === '接尾辞' && (tok.pos2 === '形容詞的');
isVerbSuffix = (tok) => isVerbSuffixUnidic(tok) || isVerbSuffixIpadic(tok);
isTatteParticle = (tok) => tok.pos1 === '助詞' && tok.pos2 === '接続助詞' && (tok.lemma === 'たって');
isBaParticle = (tok) => tok.pos1 === '助詞' && tok.pos2 === '接続助詞' && (tok.term === 'ば');
isTeDeParticle = (tok) => tok.pos1 === '助詞' && tok.pos2 === '接続助詞' && (tok.term === 'て' || tok.term === 'で' || tok.term === 'ちゃ'); // cha doesn't have a lemma in ipadic
isTaDaParticle = (tok) => isAuxVerb(tok) && (tok.term === 'た' || tok.term === 'だ');
isVerb = (tok) => tok.pos1 === '動詞' || tok.pos1 === '助動詞';
isVerbNonIndependent = (_) => true;
isProperNoun = (tok) => tok.pos1 === '名詞' && tok.pos2 === '固有名詞';
const isNounSuffixIpadic = (tok) => tok.pos1 === '動詞' && tok.pos2 === '接尾';
const isNounSuffixUnidic = (tok) => tok.pos1 === '接尾辞' && tok.pos2 === '名詞的';
isNounSuffix = (tok) => isNounSuffixIpadic(tok) || isNounSuffixUnidic(tok);
isCounter = (tok) => tok.pos1 === '名詞' && tok.pos3.startsWith('助数詞');
isNumeral = (tok) => tok.pos1 === '名詞' && tok.pos2.startsWith('数');
}

/** @type {import('mecab').ParseFragment[][]} */
const lines = [];
let last_standalone_token;

for (const rawLine of rawLines) {
/** @type {import('mecab').ParseFragment[]} */
const line = [];
- for (let {expression: term, reading, source} of rawLine) {
+ for (let {expression: term, reading, source, pos1, pos2, pos3, pos4, inflection_type, inflection_form, lemma} of rawLine) {
if (typeof term !== 'string') { term = ''; }
if (typeof reading !== 'string') { reading = ''; }
if (typeof source !== 'string') { source = ''; }
- line.push({term, reading, source});
if (typeof pos1 !== 'string') { pos1 = ''; }
if (typeof pos2 !== 'string') { pos2 = ''; }
if (typeof pos3 !== 'string') { pos3 = ''; }
if (typeof pos4 !== 'string') { pos4 = ''; }
if (typeof inflection_type !== 'string') { inflection_type = ''; }
if (typeof inflection_form !== 'string') { inflection_form = ''; }
if (typeof lemma !== 'string') { lemma = ''; }

const token = {term, reading, source, pos1, pos2, pos3, pos4, inflection_type, inflection_form, lemma};

if (ignoreReading(token)) {
token.reading = '';
}

let result_token = token;
let should_merge;
if (line.length > 0) {
const last_result_token = line[line.length - 1];
// eslint-disable-next-line @typescript-eslint/no-unsafe-assignment
should_merge = (isVerb(last_standalone_token) && (isAuxVerb(token) || (isContinuativeForm(last_standalone_token) && isVerbSuffix(token)) || (isVerbSuffix(token) && isVerbNonIndependent(last_standalone_token)))) ||
(isNoun(last_standalone_token) && !isProperNoun(last_standalone_token) && isNounSuffix(token)) ||
(isCounter(token) && isNumeral(last_standalone_token)) ||
isBaParticle(token) || isTatteParticle(token) ||
(isTeDeParticle(token) && isContinuativeForm(last_standalone_token)) ||
isTaDaParticle(token); // Allow more than just verbs here: adjectives take た too, e.g. なかった
if (should_merge) {
line.pop();
last_result_token.term = last_result_token.term + token.term;
last_result_token.reading = last_result_token.reading + token.reading;
last_result_token.source = last_result_token.source + token.source;
result_token = last_result_token;
}
}
last_standalone_token = token;
line.push(result_token);
}
lines.push(line);
}
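To see how the predicates classify a raw token, here is a minimal sketch using the 待ってる example from the comment in the diff above (the token object is hypothetical; its field values are taken from that comment):

```js
// Hypothetical ipadic token for てる in 待ってる, with the field values
// 動詞,非自立,*,*,一段,基本形,てる,テル,テル from the comment in the diff.
const token = {
    term: 'てる', reading: 'テル', source: 'てる',
    pos1: '動詞', pos2: '非自立', pos3: '*', pos4: '*',
    inflection_type: '一段', inflection_form: '基本形', lemma: 'てる',
};
const isVerbSuffixIpadic = (tok) => tok.pos1 === '動詞' && (tok.pos2 === '非自立' || tok.pos2 === '接尾');
console.log(isVerbSuffixIpadic(token)); // true: this token is a candidate to merge into the preceding verb
```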
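The merge loop itself reduces to a fold over tokens: whenever the current token qualifies, it is concatenated onto the previous fragment instead of starting a new one. A standalone sketch with a deliberately simplified shouldMerge (the real condition is the predicate chain above):

```js
// Standalone sketch of the merging step with a simplified predicate;
// the real code combines many POS checks, this only merges verb + auxiliary.
const shouldMerge = (prev, tok) => typeof prev !== 'undefined' && prev.pos1 === '動詞' && tok.pos1 === '助動詞';
const tokens = [
    {term: '待っ', reading: 'マッ', source: '待っ', pos1: '動詞'},
    {term: 'た', reading: 'タ', source: 'た', pos1: '助動詞'},
];
const line = [];
let previous;
for (const token of tokens) {
    const last = line[line.length - 1];
    if (typeof last !== 'undefined' && shouldMerge(previous, token)) {
        // Fold the auxiliary into the previous fragment.
        last.term += token.term;
        last.reading += token.reading;
        last.source += token.source;
    } else {
        line.push({...token});
    }
    previous = token;
}
console.log(line); // [{term: '待った', reading: 'マッタ', source: '待った', pos1: '動詞'}]
```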
6 changes: 3 additions & 3 deletions ext/js/comm/yomitan-api.js
@@ -239,7 +239,7 @@ export class YomitanApi {
case 'tokenize': {
/** @type {import('yomitan-api.js').tokenizeInput} */
// @ts-expect-error - Allow this to error
- const {text, scanLength} = parsedBody;
+ const {text, scanLength, parser} = parsedBody;
if (typeof text !== 'string') {
throw new Error('Invalid input for tokenize, expected "text" to be a string but got ' + typeof text);
}
@@ -250,8 +250,8 @@
text: text,
optionsContext: {index: optionsFull.profileCurrent},
scanLength: scanLength,
- useInternalParser: true,
- useMecabParser: false,
+ useInternalParser: parser !== 'mecab',
+ useMecabParser: parser === 'mecab',
};
result = await this._invoke('parseText', invokeParams);
break;
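For consumers of the API, the tokenize endpoint now selects the parser from the request body. A hypothetical call (the host and port are placeholders, not defined in this diff; only the text, scanLength, and parser fields come from the code above):

```js
// Hypothetical request; host/port are placeholders, not part of this diff.
const response = await fetch('http://127.0.0.1:19633/tokenize', {
    method: 'POST',
    headers: {'Content-Type': 'application/json'},
    body: JSON.stringify({text: '待ってるじゃないです', scanLength: 10, parser: 'mecab'}),
});
console.log(await response.json());
// Omitting parser (or any value other than 'mecab') uses the internal parser.
```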
11 changes: 11 additions & 0 deletions types/ext/mecab.d.ts
@@ -25,6 +25,13 @@ export type ParseResultTermRaw = {
expression?: string;
reading?: string;
source?: string;
pos1?: string;
pos2?: string;
pos3?: string;
pos4?: string;
inflection_type?: string;
inflection_form?: string;
lemma?: string;
};

/** The resulting data from an invocation of `parseText`. */
@@ -43,4 +50,8 @@ export type ParseFragment = {
reading: string;
/** The source text. */
source: string;
/** The part of speech (major category). */
pos1: string;
/** The part of speech (minor category). */
pos2: string;
};
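As a usage sketch, a raw MeCab record carrying the new optional fields might look like this (the object is illustrative; its values follow the れる example from the mecab.js comment):

```js
// Illustrative record for れる (動詞,接尾,*,*,一段,基本形,れる,レル,レル),
// per the comment in mecab.js.
/** @type {import('mecab').ParseResultTermRaw} */
const raw = {
    expression: 'れる',
    reading: 'レル',
    source: 'れる',
    pos1: '動詞',
    pos2: '接尾',
    pos3: '*',
    pos4: '*',
    inflection_type: '一段',
    inflection_form: '基本形',
    lemma: 'れる',
};
```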