Skip to content

Commit 9ec735c

Browse files
committed
Combine SUGGESTWF with other suggestions, don't assume it covers full squiggle
Lets us combine typos into complex errors.

This is a breaking change – rules like

    COPY (VSTR:"<$2\ $1>" SUGGESTWF) TARGET ("<(.*)>"r &missing-space-before) IF (-1 ("<(.*)>"r))

need to be changed to

    COPY (VSTR:"<\ $1>"S SUGGESTWF) TARGET ("<(.*)>"r &missing-space-before)

(i.e. don't include the preceding cohort in the suggestion).

There is a new tag DROP-PRE-BLANK for the case where you want to remove the blank before a suggestion, e.g.

    COPY (DROP-PRE-BLANK) TARGET (&space-before-punct-mark)

    "<ovddasvástadus>"
        "ovddasvástádus" Err/Orth-a-á N <BE-Ill-Any> Sem/Perc-emo Sg Nom <W:0.0> @<SUBJ &space-before-punct-mark #6->6 ID:6
    :
    "<.>"
        "." CLB <W:0.0> <SpaceBeforePunctMark> &space-before-punct-mark DROP-PRE-BLANK #7->7 ID:7 R:LEFT:6
        "." CLB <W:0.0> <SpaceBeforePunctMark> &space-before-punct-mark DROP-PRE-BLANK #7->7 ID:7 R:LEFT:6
1 parent fce43c1 commit 9ec735c

13 files changed

Lines changed: 66 additions & 88 deletions

src/suggest.cpp

Lines changed: 33 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -238,14 +238,13 @@ const Reading proc_subreading(const string& line, bool generate_all_readings) {
238238
if (tag == "COERROR") { // COERROR kept for backward-compatibility
239239
r.coerror = true;
240240
}
241-
else if (tag == "&SUGGEST" ||
242-
tag ==
243-
"SUGGEST") { // &SUGGEST kept for backward-compatibility
241+
else if (tag == "DROP-PRE-BLANK") {
242+
r.drop_pre_blank = true;
243+
}
244+
else if (tag == "&SUGGEST" || tag == "SUGGEST") { // &SUGGEST kept for backward-compatibility
244245
r.suggest = true;
245246
}
246-
else if (tag == "&SUGGESTWF" ||
247-
tag ==
248-
"SUGGESTWF") { // &SUGGESTWF kept for backward-compatibility
247+
else if (tag == "&SUGGESTWF" || tag == "SUGGESTWF") { // &SUGGESTWF kept for backward-compatibility
249248
r.suggestwf = true;
250249
}
251250
else if (result.empty()) {
@@ -261,9 +260,7 @@ const Reading proc_subreading(const string& line, bool generate_all_readings) {
261260
else if (tag == "&ADDED-BEFORE-BLANK") {
262261
r.added = AddedBeforeBlank;
263262
}
264-
else if (tag == "&LINK" ||
265-
tag ==
266-
"&COERROR") { // &LINK kept for backward-compatibility
263+
else if (tag == "&LINK" || tag == "&COERROR") { // &LINK kept for backward-compatibility
267264
r.coerror = true;
268265
}
269266
else {
@@ -350,6 +347,7 @@ const Reading proc_reading(const hfst::HfstTransducer& generator,
350347
r.sforms.insert(r.sforms.end(), sub.sforms.begin(), sub.sforms.end());
351348
r.wf = r.wf.empty() ? sub.wf : r.wf;
352349
r.fixedcase |= sub.fixedcase;
350+
r.drop_pre_blank |= sub.drop_pre_blank;
353351
}
354352
dedupe(r.rels);
355353
if (r.suggest) {
@@ -577,47 +575,30 @@ build_squiggle_replacement(const Reading& r, const ErrId& err_id,
577575
std::map<pair<size_t, size_t>, pair<u16string, Reading>>
578576
add; // position in text:cohort in Sentence
579577
// Loop from the leftmost to the rightmost of source and target cohorts:
580-
if (verbose)
581-
std::cerr << "\033[1;31m=== err_id=\t" << toUtf8(err_id)
582-
<< " ===\033[0m" << std::endl;
583-
if (verbose)
584-
std::cerr << "\033[1;33mr.id=\t" << r.id << "\033[0m" << std::endl;
585-
if (verbose)
586-
std::cerr << "\033[1;33msrc.id=\t" << src.id << "\033[0m" << std::endl;
587-
if (verbose)
588-
std::cerr << "\033[1;33mi_c=\t" << i_c << "\033[0m" << std::endl;
589-
if (verbose)
590-
std::cerr << "\033[1;33mleft=\t" << i_left << "\033[0m" << std::endl;
591-
if (verbose)
592-
std::cerr << "\033[1;33mright=\t" << i_right << "\033[0m" << std::endl;
578+
if (verbose) std::cerr << "\033[1;31m=== err_id=\t" << toUtf8(err_id) << " ===\033[0m" << std::endl;
579+
if (verbose) std::cerr << "\033[1;33mr.id=\t" << r.id << "\033[0m" << std::endl;
580+
if (verbose) if(r.drop_pre_blank) std::cerr << "\033[1;33mr.drop_pre_blank=\t" << r.drop_pre_blank << "\033[0m" << std::endl;
581+
if (verbose) std::cerr << "\033[1;33msrc.id=\t" << src.id << "\033[0m" << std::endl;
582+
if (verbose) std::cerr << "\033[1;33mi_c=\t" << i_c << "\033[0m" << std::endl;
583+
if (verbose) std::cerr << "\033[1;33mleft=\t" << i_left << "\033[0m" << std::endl;
584+
if (verbose) std::cerr << "\033[1;33mright=\t" << i_right << "\033[0m" << std::endl;
593585
UStringVector reps = { u"" };
594-
UStringVector
595-
reps_suggestwf = {}; // If we're doing SUGGESTWF, we ignore reps
596586
string prev_added_before_blank = "";
597587
std::optional<Casing> addedcasing = std::nullopt;
598588
for (size_t i = i_left; i <= i_right; ++i) {
599589
const auto& trg = sentence.cohorts[i];
600590
Casing casing = getCasing(toUtf8(trg.form));
601591

602-
if (verbose)
603-
std::cerr << "\033[1;34mi=\t" << i << "\033[0m" << std::endl;
604-
if (verbose)
605-
std::cerr << "\033[1;34mtrg.form=\t'" << toUtf8(trg.form)
606-
<< "'\033[0m" << std::endl;
607-
if (verbose)
608-
std::cerr << "\033[1;34mtrg.id=\t" << trg.id << "\033[0m"
609-
<< std::endl;
610-
if (verbose)
611-
std::cerr << "\033[1;35mtrg.raw_pre_blank=\t'" << trg.raw_pre_blank
612-
<< "'\033[0m" << std::endl;
592+
if (verbose) std::cerr << "\033[1;34mi=\t" << i << "\033[0m" << std::endl;
593+
if (verbose) std::cerr << "\033[1;34mtrg.form=\t'" << toUtf8(trg.form) << "'\033[0m" << std::endl;
594+
if (verbose) std::cerr << "\033[1;34mtrg.id=\t" << trg.id << "\033[0m" << std::endl;
595+
if (verbose) std::cerr << "\033[1;35mtrg.raw_pre_blank=\t'" << trg.raw_pre_blank << "'\033[0m" << std::endl;
613596

614597
UStringVector rep_this_trg;
615598
const bool del = do_delete(trg, err_id, src.errtypes, deletions);
616599
if (del) {
617600
rep_this_trg.push_back(u"");
618-
if (verbose)
619-
std::cerr << "\t\t\033[1;36mdelete=\t" << toUtf8(trg.form)
620-
<< "\033[0m" << std::endl;
601+
if (verbose) std::cerr << "\t\t\033[1;36mdelete=\t" << toUtf8(trg.form) << "\033[0m" << std::endl;
621602
}
622603

623604
if (trg.added) {
@@ -642,18 +623,12 @@ build_squiggle_replacement(const Reading& r, const ErrId& err_id,
642623
bool applies_deletion = trg.id == src.id && src_applies_deletion;
643624
size_t trg_beg = trg.pos;
644625
size_t trg_end = trg.pos + trg.form.size();
645-
for (const Reading& tr :
646-
readings_with_errtype(trg, err_id, applies_deletion)) {
647-
if (verbose)
648-
std::cerr << "\033[1;32mtr.line=\t" << tr.line << "\033[0m"
649-
<< std::endl;
626+
for (const Reading& tr : readings_with_errtype(trg, err_id, applies_deletion)) {
627+
if (verbose) std::cerr << "\033[1;32mtr.line=\t" << tr.line << "\033[0m" << std::endl;
650628
// Update beg/end:
651629
if (tr.added == AddedBeforeBlank) {
652630
if (i == 0) {
653-
std::cerr
654-
<< "divvun-suggest: WARNING: Saw &ADDED-BEFORE-BLANK on "
655-
"initial word, ignoring"
656-
<< std::endl;
631+
std::cerr << "divvun-suggest: WARNING: Saw &ADDED-BEFORE-BLANK on " "initial word, ignoring" << std::endl;
657632
continue;
658633
}
659634
const auto& pretrg = sentence.cohorts[i - 1];
@@ -665,40 +640,21 @@ build_squiggle_replacement(const Reading& r, const ErrId& err_id,
665640
NotAdded) { // Don't replace existing form if Added/AddedBeforeBlank
666641
trg_end = trg_beg;
667642
}
668-
if (verbose)
669-
std::cerr << "\t\033[1;35mr.wf='" << tr.wf << "'\033[0m";
670-
if (verbose)
671-
std::cerr << "\t\033[0;35mr.coerror=" << tr.coerror
672-
<< "\033[0m";
673-
if (verbose)
674-
std::cerr << "\t\033[0;35mr.suggestwf=" << tr.suggestwf
675-
<< "\033[0m";
676-
if (verbose)
677-
std::cerr << "\t\033[0;35mr.suggest=" << tr.suggest
678-
<< "\033[0m" << "\t" << tr.line;
643+
if (verbose) std::cerr << "\t\033[1;35mr.wf='" << tr.wf << "'\033[0m";
644+
if (verbose) std::cerr << "\t\033[0;35mr.coerror=" << tr.coerror << "\033[0m";
645+
if (verbose) std::cerr << "\t\033[0;35mr.suggestwf=" << tr.suggestwf << "\033[0m";
646+
if (verbose) std::cerr << "\t\033[0;35mr.suggest=" << tr.suggest << "\033[0m" << "\t" << tr.line;
679647
// Collect SUGGEST/SUGGESTWF:
680648
if (!del)
681649
for (const auto& sf : tr.sforms) {
682650
const auto cased_sf =
683651
fromUtf8(withCasing(tr.fixedcase, casing, sf));
684652
rep_this_trg.push_back(cased_sf);
685-
if (tr.suggestwf) {
686-
if (i == i_c) {
687-
reps_suggestwf.push_back(cased_sf);
688-
}
689-
else {
690-
std::cerr
691-
<< "divvun-suggest: WARNING: Saw SUGGESTWF on "
692-
"non-central (co-)cohort, ignoring"
693-
<< std::endl;
694-
}
695-
}
696-
if (verbose)
697-
std::cerr << "\t\t\033[1;36msform=\t'" << sf
698-
<< "'\033[0m" << std::endl;
653+
if (verbose) std::cerr << "\t\t\033[1;36msform=\t'" << sf << "'\033[0m" << std::endl;
699654
}
700655
fixedcase |= tr.fixedcase; // for the surface form
701656
} // end for readings of target
657+
702658
if (rep_this_trg.empty()) {
703659
const auto cased_sf =
704660
fromUtf8(withCasing(fixedcase, casing, toUtf8(trg.form)));
@@ -710,10 +666,9 @@ build_squiggle_replacement(const Reading& r, const ErrId& err_id,
710666
UStringVector reps_next;
711667
for (auto& rep : reps) {
712668
// Prepend blank unless at left edge:
713-
const auto pre_blank =
714-
i == i_left || added_before_blank ?
715-
"" :
716-
clean_blank(prev_added_before_blank + trg.raw_pre_blank);
669+
const auto pre_blank = i == i_left || added_before_blank || r.drop_pre_blank ?
670+
"" :
671+
clean_blank(prev_added_before_blank + trg.raw_pre_blank);
717672
// For &ADDED, enclose in blanks (unneeded blanks will get cleaned later):
718673
const auto post_blank = trg.added ? u" " : u"";
719674
for (const auto& sform : rep_this_trg) {
@@ -732,13 +687,8 @@ build_squiggle_replacement(const Reading& r, const ErrId& err_id,
732687
rep.erase(1 + rep.find_last_not_of(' '));
733688
rep.erase(0, rep.find_first_not_of(' '));
734689
}
735-
if (verbose)
736-
for (const auto& sf : reps) {
737-
std::cerr << "\033[1;35mreps sf=\t'" << toUtf8(sf) << "'\033[0m\t"
738-
<< beg << "," << end << std::endl;
739-
}
740-
return std::make_pair(std::make_pair(beg, end),
741-
reps_suggestwf.empty() ? reps : reps_suggestwf);
690+
if (verbose) for (const auto& sf : reps) {std::cerr << "\033[1;35mreps sf=\t'" << toUtf8(sf) << "'\033[0m\t" << beg << "," << end << std::endl;}
691+
return std::make_pair(std::make_pair(beg, end), reps);
742692
}
743693

744694
variant<Nothing, Err> Suggest::cohort_errs(const ErrId& err_id, size_t i_c,

src/suggest.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,7 @@ struct Reading {
186186
string wf; // tag of type "wordform"S for use with SUGGESTWF
187187
bool suggestwf = false;
188188
bool coerror = false; // cohorts that are not the "core" of the underline never become Err's; message template offsets refer to the cohort of the Err
189+
bool drop_pre_blank = false; // we should trim any pre-blank in suggestion
189190
Added added = NotAdded;
190191
bool fixedcase = false; // don't change casing on suggestions if we have this tag
191192
string line; // The (unchanged) input lines which created this Reading

test/suggest/errors.xml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,19 @@
108108
</body>
109109
</error>
110110

111+
<error id="msyn-demphrase-congruence-pline">
112+
<header>
113+
<title xml:lang="en">pline</title>
114+
<title xml:lang="se">pline</title>
115+
<references> <ref n=""/></references>
116+
</header>
117+
<body>
118+
<description xml:lang="en">pline</description>
119+
<description xml:lang="se">pline</description>
120+
</body>
121+
</error>
122+
123+
111124

112125
<error id="msyn-valency-acc">
113126
<header>

test/suggest/expected.suggest+suggestwf.err

Whitespace-only changes.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"errs":[["dah sajvojde",0,12,"msyn-demphrase-congruence-pline","pline",["dejnie saajvojne"],"pline"]],"text":"dah sajvojde"}

test/suggest/expected.suggestwf+suggest.err

Whitespace-only changes.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"errs":[["dah sajvojde",0,12,"msyn-demphrase-congruence-pline","pline",["dejnie saajvojne"],"pline"]],"text":"dah sajvojde"}

test/suggest/generator.strings

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,3 +38,5 @@ dát+Pron+Dem+Sg+Ill+Attr:dán
3838
ásadus+N+Sg+Gen:ásadusá
3939
dat+Pron+Dem+Sg+Gen:dan
4040
guokta+Num+Sg+Gen:guovte
41+
dïhte+Pron+Dem+Pl+Ine:dejnie
42+
dïhte+Pron+Dem+Pl+Ill:dejtie

test/suggest/input.missinglink.cg

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
"Romania" N Prop Sem/Sur Sg Loc <W:0.0> <cohort-with-dynamic-compound> @N<
33
"<,>"
44
"," CLB <W:0.0> <NoSpaceAfterPunctMark> &no-space-after-punct-mark ID:3 R:RIGHT:4
5-
"," CLB <W:0.0> <NoSpaceAfterPunctMark> ", Hellasis"S &no-space-after-punct-mark SUGGESTWF ID:3 R:RIGHT:4
5+
"," CLB <W:0.0> <NoSpaceAfterPunctMark> ", "S &no-space-after-punct-mark SUGGESTWF ID:3 R:RIGHT:4
66
"<Hellasis>"
77
"Hellas" N Prop Sem/Plc Sg Loc <W:0.0> <cohort-with-dynamic-compound> @<ADVL &typo SUGGEST ID:4
88
:\n
@@ -16,7 +16,7 @@
1616
"Romania" N Prop Sem/Sur Sg Loc <W:0.0> <cohort-with-dynamic-compound> @N<
1717
"<,>"
1818
"," CLB <W:0.0> <NoSpaceAfterPunctMark> &no-space-after-punct-mark ID:7 R:RIGHT:8
19-
"," CLB <W:0.0> <NoSpaceAfterPunctMark> ", Hellasis"S &no-space-after-punct-mark SUGGESTWF ID:7 R:RIGHT:8
19+
"," CLB <W:0.0> <NoSpaceAfterPunctMark> ", "S &no-space-after-punct-mark SUGGESTWF ID:7 R:RIGHT:8
2020
"<Hellasis>"
2121
"Hellas" N Prop Sem/Plc Sg Loc <W:0.0> <cohort-with-dynamic-compound> @<ADVL &typo SUGGEST ID:8
2222
"Hellas" N Prop Sem/Sur Sg Loc <W:0.0> <cohort-with-dynamic-compound> @<ADVL &typo SUGGEST ID:8

test/suggest/input.multiple-errtags.cg

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,6 @@
33
"ovddasvástádus" N <BE-Ill-Any> Sem/Perc-emo Sg Nom <W:0.0> @<SUBJ &typo SUGGEST #6->6 ID:6
44
:
55
"<.>"
6-
"." CLB <W:0.0> <SpaceBeforePunctMark> &space-before-punct-mark #7->7 ID:7 R:LEFT:6
7-
"." CLB <W:0.0> <SpaceBeforePunctMark> "ovddasvástadus."S &space-before-punct-mark SUGGESTWF #7->7 ID:7 R:LEFT:6
6+
"." CLB <W:0.0> <SpaceBeforePunctMark> &space-before-punct-mark DROP-PRE-BLANK #7->7 ID:7 R:LEFT:6
7+
"." CLB <W:0.0> <SpaceBeforePunctMark> "."S &space-before-punct-mark DROP-PRE-BLANK SUGGESTWF #7->7 ID:7 R:LEFT:6
88
:\n

0 commit comments

Comments
 (0)