Skip to content

Commit 9046a81

Browse files
committed
do normalising per tag like phons
see giellalt/lang-sme#528
1 parent 2901416 commit 9046a81

6 files changed

Lines changed: 125 additions & 114 deletions

File tree

src/main_normaliser.cpp

Lines changed: 25 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -29,21 +29,21 @@ int main(int argc, char** argv) {
2929
argv[0], "BIN - use FSTs to normalise and expand text for TTS");
3030

3131
options.add_options()("a,surface-analyser", "FST for surface analysis",
32-
cxxopts::value<std::string>(), "BIN")("n,normaliser",
33-
"FST for normalisation", cxxopts::value<std::string>(),
32+
cxxopts::value<std::string>(),
33+
"BIN")("n,normalisers", "FSTs for normalisation per tag: TAG=ABIN",
34+
cxxopts::value<std::vector<std::string>>(),
3435
"BIN")("d,deep-analyser", "FST for deep analysis (UNIMPLEMENTED)",
3536
cxxopts::value<std::string>(),
3637
"BIN")("i,input", "Input file (UNIMPLEMENTED, stdin for now)",
3738
cxxopts::value<std::string>(),
3839
"FILE")("o,output", "Output file (UNIMPLEMENTED, stdout for now)",
3940
cxxopts::value<std::string>(), "FILE")("g,generator",
4041
"FST for generations", cxxopts::value<std::string>(),
41-
"BIN")("t,tags", "limit tags to expand",
42-
cxxopts::value<std::vector<std::string>>(), "TAGS")("v,verbose",
43-
"Be verbose")("D,debug", "Be debugsy")("T,trace", "Be tracy")(
44-
"V,version", "Version information")("h,help", "Print help");
42+
"BIN")("v,verbose", "Be verbose")("D,debug", "Be debugsy")(
43+
"T,trace", "Be tracy")("V,version", "Version information")(
44+
"h,help", "Print help");
4545

46-
std::vector<std::string> pos = { "normaliser", "input", "output" };
46+
std::vector<std::string> pos = { "normalisers", "input", "output" };
4747
options.parse_positional(pos);
4848
options.parse(argc, argv);
4949

@@ -65,9 +65,10 @@ int main(int argc, char** argv) {
6565
return (EXIT_SUCCESS);
6666
}
6767

68-
if (!options.count("normaliser")) {
68+
if (!options.count("normalisers")) {
6969
std::cout << options.help({ "" }) << std::endl;
70-
std::cerr << argv[0] << " ERROR: expected --normaliser option"
70+
std::cerr << argv[0]
71+
<< " ERROR: expected one or more --normalisers option"
7172
<< std::endl;
7273
return (EXIT_FAILURE);
7374
}
@@ -84,11 +85,6 @@ int main(int argc, char** argv) {
8485
<< std::endl;
8586
return (EXIT_FAILURE);
8687
}
87-
if (!options.count("tags")) {
88-
std::cerr << argv[0]
89-
<< " WARNING: expected at least one --tags option."
90-
<< std::endl;
91-
}
9288
const auto& verbose = options.count("verbose");
9389
const auto& debug = options.count("debug");
9490
const auto& trace = options.count("trace");
@@ -105,10 +101,6 @@ int main(int argc, char** argv) {
105101
if (verbose) {
106102
std::cout << "Surface analyser set to: " << sanalyser << std::endl;
107103
}
108-
const auto& normaliserfile = options["normaliser"].as<std::string>();
109-
if (verbose) {
110-
std::cout << "Normaliser set to: " << normaliserfile << std::endl;
111-
}
112104
const auto& generator = options["generator"].as<std::string>();
113105
if (verbose) {
114106
std::cout << "Generator set to: " << generator << std::endl;
@@ -117,16 +109,23 @@ int main(int argc, char** argv) {
117109
if (verbose) {
118110
std::cout << "Deep analyser set to: " << danalyser << std::endl;
119111
}
120-
const auto& tags = options["tags"].as<std::vector<std::string>>();
121-
if (verbose) {
122-
std::cout << "Tags set to: ";
123-
for (auto tag : tags) {
124-
std::cout << tag << " ";
112+
auto normaliser = divvun::Normaliser(
113+
generator, sanalyser, danalyser, verbose, trace, debug);
114+
for (const auto& tag2fsa :
115+
options["normalisers"].as<std::vector<std::string>>()) {
116+
auto eqpos = tag2fsa.find("=");
117+
if (eqpos == string::npos) {
118+
std::cerr << "missing = in " << tag2fsa << std::endl;
119+
return EXIT_FAILURE;
120+
}
121+
auto tag = tag2fsa.substr(0, eqpos);
122+
auto fsa = tag2fsa.substr(eqpos + 1);
123+
if (verbose) {
124+
std::cout << "Nrormaliser for tag ’" << tag << "’ set to "
125+
<< fsa << std::endl;
125126
}
126-
std::cout << std::endl;
127+
normaliser.addNormaliser(tag, fsa);
127128
}
128-
auto normaliser = divvun::Normaliser(normaliserfile, generator,
129-
sanalyser, danalyser, tags, verbose, trace, debug);
130129
normaliser.run(std::cin, std::cout);
131130
}
132131
catch (const cxxopts::OptionException& e) {

src/normaliser.cpp

Lines changed: 44 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -19,40 +19,31 @@
1919

2020
namespace divvun {
2121

22-
Normaliser::Normaliser(const hfst::HfstTransducer* normaliser_,
23-
const hfst::HfstTransducer* generator_,
22+
Normaliser::Normaliser(const hfst::HfstTransducer* generator_,
2423
const hfst::HfstTransducer* sanalyser_,
25-
const hfst::HfstTransducer* danalyser_, const vector<string>& tags_,
26-
bool verbose_, bool trace_, bool debug_)
27-
: normaliser(normaliser_)
28-
, generator(generator_)
24+
const hfst::HfstTransducer* danalyser_, bool verbose_, bool trace_,
25+
bool debug_)
26+
: generator(generator_)
2927
, sanalyser(sanalyser_)
3028
, danalyser(danalyser_)
31-
, tags(tags_)
3229
, verbose(verbose_)
3330
, trace(trace_)
3431
, debug(debug_) {}
3532

36-
Normaliser::Normaliser(const string& normaliser_, const string& generator_,
37-
const string& sanalyser_, const string& danalyser_,
38-
const vector<string>& tags_, bool verbose_, bool trace_, bool debug_) {
33+
Normaliser::Normaliser(const string& generator_, const string& sanalyser_,
34+
const string& danalyser_, bool verbose_, bool trace_, bool debug_) {
3935
debug = debug_;
4036
verbose = verbose_;
4137
trace = trace_;
4238
if (verbose_) {
4339
std::cout << "Reading files: " << std::endl;
44-
std::cout << "* " << normaliser_ << std::endl;
4540
if (trace_) {
4641
std::cout << "Printing traces" << std::endl;
4742
}
4843
if (debug_) {
4944
std::cout << "Printing debugs" << std::endl;
5045
}
5146
}
52-
if (normaliser_ != "") {
53-
normaliser = std::unique_ptr<const hfst::HfstTransducer>(
54-
(readTransducer(normaliser_)));
55-
}
5647
if (verbose_) {
5748
std::cout << "* " << generator_ << std::endl;
5849
}
@@ -74,17 +65,28 @@ Normaliser::Normaliser(const string& normaliser_, const string& generator_,
7465
danalyser = std::unique_ptr<const hfst::HfstTransducer>(
7566
(readTransducer(danalyser_)));
7667
}
77-
if (verbose_) {
78-
std::cout << "expanding tags: ";
79-
for (auto tag : tags_) {
80-
std::cout << tag << " ";
81-
}
82-
std::cout << std::endl;
83-
}
84-
tags = tags_;
8568
verbose = verbose_;
8669
}
8770

71+
void Normaliser::addNormaliser(
72+
const std::string& tag, const hfst::HfstTransducer* nromaliser_) {
73+
if (verbose) {
74+
std::cout << "adding HFST transducer for tag " << tag << std::endl;
75+
}
76+
normalisers[tag] =
77+
std::unique_ptr<const hfst::HfstTransducer>(nromaliser_);
78+
}
79+
80+
void Normaliser::addNormaliser(
81+
const std::string& tag, const std::string& normaliser_) {
82+
if (verbose) {
83+
std::cout << "REading " << normaliser_ << " for tag " << tag
84+
<< std::endl;
85+
}
86+
normalisers[tag] =
87+
std::unique_ptr<const hfst::HfstTransducer>(readTransducer(normaliser_));
88+
}
89+
8890
void Normaliser::mangle_reading(CGReading& reading, std::ostream& os) {
8991
string outstring = string(reading.reading);
9092
string surf = ""; // XXX
@@ -94,18 +96,15 @@ void Normaliser::mangle_reading(CGReading& reading, std::ostream& os) {
9496
auto tabend = outstring.find("\"");
9597
auto tabs = outstring.substr(tabstart, tabend);
9698
bool everythinghasfailed = true;
97-
if (tags.empty()) {
98-
everythinghasfailed = false;
99-
//os << outstring << std::endl;
100-
}
101-
bool expand = false;
99+
std::string expandtag;
102100
bool expandmain = false;
103-
for (auto tag : tags) {
104-
if (outstring.find(tag) != std::string::npos) {
101+
for (auto& normaliser : normalisers) {
102+
if (outstring.find(normaliser.first) != std::string::npos) {
105103
if (debug) {
106-
std::cout << "Expanding because of " << tag << std::endl;
104+
std::cout << "Expanding because of " << normaliser.first
105+
<< std::endl;
107106
}
108-
expand = true;
107+
expandtag = normaliser.first;
109108
}
110109
}
111110
if (reading.subreading != nullptr) {
@@ -153,12 +152,14 @@ void Normaliser::mangle_reading(CGReading& reading, std::ostream& os) {
153152
std::cout << "Using lemma: " << surf << std::endl;
154153
}
155154
}
156-
if (expand) {
155+
if (!expandtag.empty()) {
157156
// 1. apply expansions from normaliser
158157
if (debug) {
159-
std::cout << "1. looking up normaliser for " << surf << std::endl;
158+
std::cout << "1. looking up " << expandtag << " normaliser for "
159+
<< surf << std::endl;
160160
}
161-
const HfstPaths1L expansions(normaliser->lookup_fd(surf, -1, 2.0));
161+
const HfstPaths1L expansions(
162+
normalisers[expandtag]->lookup_fd(surf, -1, 2.0));
162163
if (expansions->empty()) {
163164
if (debug) {
164165
std::cout << "Normaliser results empty." << std::endl;
@@ -258,11 +259,11 @@ void Normaliser::mangle_reading(CGReading& reading, std::ostream& os) {
258259
p = s.find(r);
259260
}
260261
}
261-
for (auto tag : tags) {
262-
p = s.find("+" + tag);
262+
for (auto& normaliser : normalisers) {
263+
p = s.find("+" + normaliser.first);
263264
while (p != std::string::npos) {
264-
s.replace(p, tag.length() + 1, "");
265-
p = s.find(tag);
265+
s.replace(p, normaliser.first.length() + 1, "");
266+
p = s.find(normaliser.first);
266267
}
267268
}
268269
regentags = s;
@@ -490,11 +491,11 @@ void Normaliser::mangle_reading(CGReading& reading, std::ostream& os) {
490491
p = s.find(r);
491492
}
492493
}
493-
for (auto tag : tags) {
494-
p = s.find("+" + tag);
494+
for (auto& normaliser : normalisers) {
495+
p = s.find("+" + normaliser.first);
495496
while (p != std::string::npos) {
496-
s.replace(p, tag.length() + 1, "");
497-
p = s.find(tag);
497+
s.replace(p, normaliser.first.length() + 1, "");
498+
p = s.find(normaliser.first);
498499
}
499500
}
500501
regentags = s;

src/normaliser.hpp

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -41,26 +41,27 @@ using std::vector;
4141

4242
class Normaliser {
4343
public:
44-
Normaliser(const hfst::HfstTransducer* normaliser,
45-
const hfst::HfstTransducer* generator,
44+
Normaliser(const hfst::HfstTransducer* generator,
4645
const hfst::HfstTransducer* sanalyser,
47-
const hfst::HfstTransducer* danalyser, const vector<string>& tags,
48-
bool verbose, bool trace, bool debug);
49-
Normaliser(const string& normaliser, const string& generator,
50-
const string& sanalyser, const string& danalyser,
51-
const vector<string>& tags, bool verbose, bool trace, bool debug);
46+
const hfst::HfstTransducer* danalyser, bool verbose, bool trace,
47+
bool debug);
48+
Normaliser(const string& generator, const string& sanalyser,
49+
const string& danalyser, bool verbose, bool trace, bool debug);
50+
void addNormaliser(
51+
const std::string& tag, const hfst::HfstTransducer* normaliser);
52+
void addNormaliser(const std::string& tag, const std::string& normaliser);
5253
/*const*/ void run(std::istream& is, std::ostream& os);
5354

5455
private:
5556
void process_cohort(CGCohort& cohort, std::ostream& os);
5657
void process_reading(CGReading& reading, std::ostream& os);
5758
std::string process_subreading(CGReading& subreading, std::ostream& os);
5859
void mangle_reading(CGReading& reading, std::ostream& os);
59-
unique_ptr<const hfst::HfstTransducer> normaliser;
60+
std::map<std::string, std::unique_ptr<const hfst::HfstTransducer>>
61+
normalisers;
6062
unique_ptr<const hfst::HfstTransducer> generator;
6163
unique_ptr<const hfst::HfstTransducer> sanalyser;
6264
unique_ptr<const hfst::HfstTransducer> danalyser;
63-
vector<string> tags;
6465
bool verbose;
6566
bool trace;
6667
bool debug;

src/pipeline.cpp

Lines changed: 34 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -58,15 +58,24 @@ void MweSplitCmd::run(stringstream& input, stringstream& output) const {
5858
applicator.get(), (std_istream*)&input, (std_ostream*)&output);
5959
}
6060

61-
NormaliseCmd::NormaliseCmd(const hfst::HfstTransducer* normaliser_,
62-
const hfst::HfstTransducer* generator, const hfst::HfstTransducer* analyser,
63-
const vector<string>& tags, bool verbose)
64-
: normaliser(new divvun::Normaliser(
65-
normaliser_, generator, analyser, NULL, tags, verbose, false, false)) {}
66-
NormaliseCmd::NormaliseCmd(const string& normaliser_, const string& generator,
67-
const string& analyser, const vector<string>& tags, bool verbose)
61+
NormaliseCmd::NormaliseCmd(const hfst::HfstTransducer* generator,
62+
const hfst::HfstTransducer* analyser,
63+
const std::map<string, const hfst::HfstTransducer*>& normalisers,
64+
bool verbose)
6865
: normaliser(new divvun::Normaliser(
69-
normaliser_, generator, analyser, "", tags, verbose, false, false)) {}
66+
generator, analyser, NULL, verbose, false, false)) {
67+
for (const auto& normaliserfsa : normalisers) {
68+
normaliser->addNormaliser(normaliserfsa.first, normaliserfsa.second);
69+
}
70+
}
71+
// Builds a normalise command from file paths: constructs the wrapped
// divvun::Normaliser with the generator and analyser paths (empty deep
// analyser path, tracing and debugging off), then registers one normaliser
// transducer path per entry of `normalisers` (key = tag, value = path).
NormaliseCmd::NormaliseCmd(const string& generator, const string& analyser,
  const std::map<string, string>& normalisers, bool verbose)
  : normaliser(new divvun::Normaliser(generator, analyser,
      /*danalyser=*/"", verbose, /*trace=*/false, /*debug=*/false)) {
	for (auto entry = normalisers.begin(); entry != normalisers.end();
	     ++entry) {
		const string& tag = entry->first;
		const string& fsapath = entry->second;
		normaliser->addNormaliser(tag, fsapath);
	}
}
7079

7180
void NormaliseCmd::run(stringstream& input, stringstream& output) const {
7281
normaliser->run(input, output);
@@ -343,16 +352,17 @@ Pipeline Pipeline::mkPipeline(const unique_ptr<ArPipeSpec>& ar_spec,
343352
std::istream is(&osrb);
344353
return readTransducer(is);
345354
};
346-
auto tags = std::vector<std::string>();
347-
const pugi::xml_node& tags_element = cmd.child("tags");
348-
for (const pugi::xml_node& tag : tags_element.children()) {
349-
tags.push_back(tag.attribute("n").value());
355+
map<string, const hfst::HfstTransducer*> normalisers;
356+
auto normalisertags = cmd.children("normaliser");
357+
for (const auto& normalisertag : normalisertags) {
358+
normalisers[normalisertag.attribute("s").as_string()] =
359+
readArchiveExtract(ar_spec->ar_path,
360+
normalisertag.attribute("n").as_string(), f);
350361
}
351362
auto* s = new NormaliseCmd(
352-
readArchiveExtract(ar_spec->ar_path, args["normaliser"], f),
353363
readArchiveExtract(ar_spec->ar_path, args["generator"], f),
354-
readArchiveExtract(ar_spec->ar_path, args["analyser"], f), tags,
355-
verbose);
364+
readArchiveExtract(ar_spec->ar_path, args["analyser"], f),
365+
normalisers, verbose);
356366
cmds.emplace_back(s);
357367
}
358368
else if (name == u"blanktag") {
@@ -451,20 +461,21 @@ Pipeline Pipeline::mkPipeline(const unique_ptr<PipeSpec>& spec,
451461
#endif
452462
}
453463
else if ((name == u"normalise") || (name == u"normalize")) {
454-
auto tags = std::vector<std::string>();
455-
const pugi::xml_node& tags_element = cmd.child("tags");
456-
for (const pugi::xml_node& tag : tags_element.children()) {
457-
tags.push_back(tag.attribute("n").value());
464+
map<string, string> normalisers;
465+
auto normalisertags = cmd.children("normaliser");
466+
for (const auto& normalisertag : normalisertags) {
467+
normalisers[normalisertag.attribute("s").as_string()] =
468+
normalisertag.attribute("n").as_string();
458469
}
459-
cmds.emplace_back(new NormaliseCmd(args["normaliser"],
460-
args["generator"], args["analyser"], tags, verbose));
470+
cmds.emplace_back(new NormaliseCmd(
471+
args["generator"], args["analyser"], normalisers, verbose));
461472
}
462473
else if (name == u"phon") {
463474
map<string, string> altfsas;
464475
auto alttags = cmd.children("alttext2ipa");
465476
for (const auto& alttag : alttags) {
466-
altfsas[alttag.attribute("n").as_string()] =
467-
alttag.attribute("s").as_string();
477+
altfsas[alttag.attribute("s").as_string()] =
478+
alttag.attribute("n").as_string();
468479
}
469480
cmds.emplace_back(
470481
new PhonCmd(args["text2ipa"], altfsas, verbose, trace));

src/pipeline.hpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -153,11 +153,11 @@ class NormaliseCmd : public PipeCmd {
153153
// Normaliser(const string& normaliser, const string& generator,
154154
// const string& sanalyser, const string& danalyser,
155155
// const vector<string>& tags, bool verbose);
156-
explicit NormaliseCmd(const string& normaliser, const string& generator,
157-
const string& analyser, const vector<string>& tags, bool verbose);
158-
NormaliseCmd(const hfst::HfstTransducer* normaliser,
159-
const hfst::HfstTransducer* generator,
160-
const hfst::HfstTransducer* analyser, const vector<string>& tags,
156+
explicit NormaliseCmd(const string& generator, const string& analyser,
157+
const map<string, string>& normalisers, bool verbose);
158+
NormaliseCmd(const hfst::HfstTransducer* generator,
159+
const hfst::HfstTransducer* analyser,
160+
const map<string, const hfst::HfstTransducer*>& normalisers,
161161
bool verbose);
162162
void run(stringstream& input, stringstream& output) const override;
163163
~NormaliseCmd() override = default;

0 commit comments

Comments
 (0)