Skip to content

Commit abd2efa

Browse files
authored
Merge pull request #2911 from fernewelten/NC_Unicode_Char
New Compiler: Accept UTF-8 within `'…'`
2 parents e463deb + 9e5b029 commit abd2efa

File tree

6 files changed

+237
-110
lines changed

6 files changed

+237
-110
lines changed

Compiler/script2/cs_parser.cpp

Lines changed: 11 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -7722,21 +7722,6 @@ void AGS::Parser::Parse()
77227722
}
77237723
}
77247724

7725-
// Scan inpl into scan tokens, build a symbol table
7726-
int cc_scan(std::string const &inpl, AGS::SrcList &src, AGS::ccCompiledScript &scrip, AGS::SymbolTable &symt, AGS::MessageHandler &mh)
7727-
{
7728-
AGS::Scanner scanner = { inpl, src, scrip, symt, mh };
7729-
scanner.Scan();
7730-
return -static_cast<int>(mh.HasError());
7731-
}
7732-
7733-
int cc_parse(AGS::SrcList &src, AGS::FlagSet options, AGS::ccCompiledScript &scrip, AGS::SymbolTable &symt, AGS::MessageHandler &mh)
7734-
{
7735-
AGS::Parser parser = { src, options, scrip, symt, mh };
7736-
parser.Parse();
7737-
return -static_cast<int>(mh.HasError());
7738-
}
7739-
77407725
int cc_compile(std::string const &inpl, AGS::FlagSet options, AGS::ccCompiledScript &scrip, AGS::MessageHandler &mh)
77417726
{
77427727
AGS::SymbolTable local_symt;
@@ -7754,9 +7739,16 @@ int cc_compile(std::string const &inpl, AGS::FlagSet options, AGS::ccCompiledScr
77547739
src.NewSection("UnnamedSection");
77557740
src.NewLine(1u);
77567741

7757-
int error_code = cc_scan(inpl, src, scrip, symt, mh);
7758-
if (error_code >= 0)
7759-
error_code = cc_parse(src, options, scrip, symt, mh);
7742+
AGS::Scanner scanner = { inpl, FlagIsSet(options, SCOPT_UTF8), src, scrip, symt, mh };
7743+
scanner.Scan();
7744+
if (mh.HasError())
7745+
return -1;
7746+
7747+
AGS::Parser parser = { src, options, scrip, symt, mh };
7748+
parser.Parse();
7749+
if (mh.HasError())
7750+
return -2;
7751+
77607752
sections = lh.CreateSectionList();
7761-
return error_code;
7753+
return 0;
77627754
}

Compiler/script2/cs_scanner.cpp

Lines changed: 94 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,9 @@
2626
std::string const AGS::Scanner::kNewSectionLitPrefix = "__NEWSCRIPTSTART_";
2727
size_t const AGS::Scanner::kNewSectionLitPrefixSize = 17u;
2828

29-
AGS::Scanner::Scanner(std::string const &input, SrcList &token_list, ccCompiledScript &string_collector, SymbolTable &symt, MessageHandler &messageHandler)
30-
: _ocMatcher(*this)
29+
AGS::Scanner::Scanner(std::string const &input, bool const use_utf8, SrcList &token_list, ccCompiledScript &string_collector, SymbolTable &symt, MessageHandler &messageHandler)
30+
: _useUtf8(use_utf8)
31+
, _ocMatcher(*this)
3132
, _lineno(1u)
3233
, _tokenList(token_list)
3334
, _msgHandler(messageHandler)
@@ -345,7 +346,7 @@ void AGS::Scanner::ReadInNumberLit(std::string &symstring, ScanType &scan_type,
345346

346347
errno = 0;
347348
char *endptr;
348-
std::strtof(valstring.c_str(), &endptr);
349+
(void) std::strtof(valstring.c_str(), &endptr);
349350
bool const can_be_a_floating_point =
350351
(errno == 0 || errno == ERANGE) && // range errors will be treated below
351352
(valstring.length() == endptr - valstring.c_str()); // ensure that all chars are used up in the conversion
@@ -459,6 +460,11 @@ void AGS::Scanner::ReadInCharLit(std::string &symstring, CodeCell &value)
459460

460461
EscapedChar2Char(lit_char, symstring, lit_char);
461462
}
463+
else if (UseUtf8() && (lit_char < 0 || lit_char > 0x7f))
464+
{
465+
// Accept a UTF-8 sequence, use the codepoint that corresponds to the sequence
466+
ReadInCharLit_utf8(lit_char, symstring, lit_char);
467+
}
462468

463469
// Closing '\''
464470
int const closer = Get();
@@ -558,6 +564,91 @@ void AGS::Scanner::EscapedChar2Char(int first_char_after_backslash, std::string
558564
UserError("Unrecognized '\\%c' in character or string literal", first_char_after_backslash);
559565
}
560566

567+
void AGS::Scanner::ReadInCharLit_utf8(int const first_char, std::string &symstring, int &converted)
568+
{
569+
unsigned int const char0 = first_char & 0xff;
570+
if (char0 <= 0x7f)
571+
{
572+
// No UTF-8 sequence
573+
symstring.push_back(first_char);
574+
converted = first_char;
575+
return;
576+
}
577+
578+
if (char0 < 0xc2 || char0 > 0xf4)
579+
UserError("After the quote mark, found the byte '0x%02X'; "
580+
"a legal UTF-8 sequence cannot start this way (file corrupt?)", char0);
581+
582+
do // exactly 1 time
583+
{
584+
int next_char = Get();
585+
if (EOFReached() || Failed())
586+
break; // to error msg
587+
symstring.push_back(next_char);
588+
unsigned int const char1 = next_char & 0xff;
589+
590+
// Strictly going by the algo, you can encode some characters
591+
// in different ways, but the standard doesn't allow this:
592+
// You must always use the shortest encoding possible.
593+
// Thus, some combinations of first and second byte are forbidden.
594+
if (char1 < 0x80 ||
595+
char1 > 0xbf ||
596+
char0 == 0xe0 && char1 <= 0x9f ||
597+
char0 == 0xed && char1 >= 0xa0 ||
598+
char0 == 0xf0 && char1 <= 0x8f ||
599+
char0 == 0xf4 && char1 >= 0x90)
600+
UserError("After the quote mark, found the bytes '0x%02X%02X'; "
601+
"a legal UTF-8 sequence cannot start this way (file corrupt?)", char0, char1);
602+
603+
if (char0 <= 0xdf)
604+
{
605+
// 2-byte sequence
606+
converted = ((char0 & 0x1f) << 6) + (char1 & 0x3f);
607+
return;
608+
}
609+
610+
next_char = Get();
611+
if (EOFReached() || Failed())
612+
break; // to error msg
613+
unsigned int const char2 = next_char & 0xff;
614+
if (char2 < 0x80 || char2 > 0xbf)
615+
UserError("After the quote mark, found the bytes '0x%02X%02X%02X'; "
616+
"a legal UTF-8 sequence cannot start this way (file corrupt?)", char0, char1, char2);
617+
symstring.push_back(next_char);
618+
619+
if (char0 <= 0xef)
620+
{
621+
// 3-byte sequence
622+
converted = ((char0 & 0x0F) << 12) |
623+
((char1 & 0x3F) << 6) |
624+
(char2 & 0x3F);
625+
return;
626+
}
627+
628+
// 4-byte sequence
629+
next_char = Get();
630+
if (EOFReached() || Failed())
631+
break; // to error msg
632+
unsigned int const char3 = next_char & 0xff;
633+
if (char3 < 0x80 || char3 > 0xbf)
634+
UserError("After the quote mark, found the bytes '0x%02X%02X%02X%02X' "
635+
"a legal UTF-8 sequence cannot start this way (file corrupt?)", char0, char1, char2, char3);
636+
symstring.push_back(next_char);
637+
638+
converted = ((char0 & 0x07) << 18) |
639+
((char1 & 0x3F) << 12) |
640+
((char2 & 0x3F) << 6) |
641+
(char3 & 0x3F);
642+
return;
643+
} while (false);
644+
645+
// Here in case of reading errors
646+
if (EOFReached())
647+
UserError("Reached the end of the input inmidst of a UTF-8 sequence (file corrupt?)");
648+
if (Failed())
649+
UserError("Encountered a reading error inmidst of a UTF-8 sequence (file corrupt?)");
650+
}
651+
561652
void AGS::Scanner::ReadInStringLit(std::string &symstring, std::string &valstring)
562653
{
563654
symstring = "\"";

Compiler/script2/cs_scanner.h

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,7 @@ class Scanner
9898
// We need that, so we collect in _eofReached the fact that end-of-stream has ever been reached.
9999
bool _eofReached = false;
100100
bool _failed = false;
101+
bool _useUtf8;
101102
std::size_t _lineno;
102103
std::string _section;
103104
SrcList &_tokenList;
@@ -131,6 +132,9 @@ class Scanner
131132
// Translate a '\\' combination into a character, backslash is already read in
132133
void EscapedChar2Char(int first_char_after_backslash, std::string &symstring, int &converted);
133134

135+
// Read a sequence of bytes in UTF-8 format, translate to the respective codepoint
136+
void ReadInCharLit_utf8(int first_char, std::string &symstring, int &converted);
137+
134138
// Read oct combination \777; backslash is already read in
135139
int OctDigits2Char(int first_digit_char, std::string &symstring);
136140

@@ -192,11 +196,14 @@ class Scanner
192196
inline static void ReplaceToken(std::string &where, std::string const &token, std::string const &replacement) { where.replace(where.find(token), token.length(), replacement); }
193197

194198
public:
195-
Scanner(std::string const &input, SrcList &token_list, ccCompiledScript &string_collector, SymbolTable &symt, MessageHandler &messageHandler);
199+
Scanner(std::string const &input, bool use_utf8, SrcList &token_list, ccCompiledScript &string_collector, SymbolTable &symt, MessageHandler &messageHandler);
196200

197201
// Scan the input into token_list; symbols into symt; strings into _string_collector
198202
void Scan();
199203

204+
// Whether UTF-8 encoding is used
205+
inline bool UseUtf8() const { return _useUtf8; }
206+
200207
// Returns whether we've encountered EOF.
201208
inline bool EOFReached() const { return _eofReached; };
202209
// Returns whether we've encountered a read failure

Compiler/test2/cc_parser_test_0.cpp

Lines changed: 31 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
#include "script2/cc_symboltable.h"
2121
#include "script2/cc_internallist.h"
22+
#include "script2/cs_scanner.h"
2223
#include "script2/cs_parser.h"
2324

2425
#include "cc_parser_test_lib.h"
@@ -300,9 +301,12 @@ TEST_F(Compile0, EnumNegative) {
300301
};\
301302
";
302303

303-
// Call cc_scan() and cc_parse() by hand so that we can see the symbol table
304-
ASSERT_LE(0, cc_scan(inpl, targ, scrip, sym, mh));
305-
int compile_result = cc_parse(targ, options, scrip, sym, mh);
304+
// Call scanner and parser by hand so that we can see the symbol table
305+
AGS::Scanner scanner = { inpl, false, targ, scrip, sym, mh };
306+
scanner.Scan();
307+
ASSERT_FALSE(mh.HasError());
308+
AGS::Parser parser = { targ, options, scrip, sym, mh };
309+
parser.Parse();
306310
std::string const &err_msg = mh.GetError().Message;
307311
size_t err_line = mh.GetError().Lineno;
308312
EXPECT_EQ(0u, mh.WarningsCount());
@@ -351,12 +355,14 @@ TEST_F(Compile0, DefaultParametersLargeInts) {
351355
";
352356

353357

354-
ASSERT_LE(0, cc_scan(inpl, targ, scrip, sym, mh));
355-
int compile_result = cc_parse(targ, options, scrip, sym, mh);
358+
AGS::Scanner scanner = { inpl, false, targ, scrip, sym, mh };
359+
scanner.Scan();
360+
ASSERT_FALSE(mh.HasError());
361+
AGS::Parser parser = { targ, options, scrip, sym, mh };
362+
parser.Parse();
356363
std::string const &err_msg = mh.GetError().Message;
357364
size_t err_line = mh.GetError().Lineno;
358365
EXPECT_EQ(0u, mh.WarningsCount());
359-
360366
ASSERT_STREQ("Ok", mh.HasError() ? err_msg.c_str() : "Ok");
361367

362368
AGS::Symbol const funcidx = sym.Find("importedfunc");
@@ -405,12 +411,14 @@ TEST_F(Compile0, ImportFunctionReturningDynamicArray) {
405411
}; \n\
406412
";
407413

408-
ASSERT_LE(0, cc_scan(inpl, targ, scrip, sym, mh));
409-
int compile_result = cc_parse(targ, options, scrip, sym, mh);
414+
AGS::Scanner scanner = { inpl, false, targ, scrip, sym, mh };
415+
scanner.Scan();
416+
ASSERT_FALSE(mh.HasError());
417+
AGS::Parser parser = { targ, options, scrip, sym, mh };
418+
parser.Parse();
410419
std::string const &err_msg = mh.GetError().Message;
411420
size_t err_line = mh.GetError().Lineno;
412421
EXPECT_EQ(0u, mh.WarningsCount());
413-
414422
ASSERT_STREQ("Ok", mh.HasError() ? err_msg.c_str() : "Ok");
415423

416424
int funcidx;
@@ -847,8 +855,13 @@ TEST_F(Compile0, LocalGlobalSeq2) {
847855
AGS::SymbolTable sym;
848856
AGS::FlagSet const options = ~SCOPT_NOIMPORTOVERRIDE | SCOPT_LINENUMBERS;
849857

850-
ASSERT_LE(0, cc_scan(inpl, targ, scrip, sym, mh));
851-
ASSERT_EQ(0, cc_parse(targ, options, scrip, sym, mh));
858+
AGS::Scanner scanner = { inpl, false, targ, scrip, sym, mh };
859+
scanner.Scan();
860+
ASSERT_FALSE(mh.HasError());
861+
AGS::Parser parser = { targ, options, scrip, sym, mh };
862+
parser.Parse();
863+
std::string const &err_msg = mh.GetError().Message;
864+
ASSERT_STREQ("Ok", mh.HasError() ? err_msg.c_str() : "Ok");
852865

853866
ASSERT_LE(1u, mh.GetMessages().size());
854867
EXPECT_EQ(7u, mh.GetMessages()[0].Lineno);
@@ -1850,8 +1863,13 @@ TEST_F(Compile0, Import2GlobalAllocation) {
18501863
AGS::SymbolTable sym;
18511864
AGS::FlagSet const options = ~SCOPT_NOIMPORTOVERRIDE | SCOPT_LINENUMBERS;
18521865

1853-
ASSERT_LE(0, cc_scan(inpl, targ, scrip, sym, mh));
1854-
ASSERT_EQ(0, cc_parse(targ, options, scrip, sym, mh));
1866+
AGS::Scanner scanner = { inpl, false, targ, scrip, sym, mh };
1867+
scanner.Scan();
1868+
ASSERT_FALSE(mh.HasError());
1869+
AGS::Parser parser = { targ, options, scrip, sym, mh };
1870+
parser.Parse();
1871+
std::string const &err_msg = mh.GetError().Message;
1872+
ASSERT_STREQ("Ok", mh.HasError() ? err_msg.c_str() : "Ok");
18551873

18561874
AGS::Symbol const idx = sym.Find("J");
18571875
ASSERT_LE(0, idx);

Compiler/test2/cc_parser_test_lib.h

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -22,21 +22,5 @@
2222
extern char kAgsHeaderString[];
2323
extern char kAgsHeaderBool[];
2424

25-
// Only use this function for googletests. Scan and tokenize the input.
26-
extern int cc_scan(
27-
std::string const &inpl, // preprocessed text to be tokenized
28-
AGS::SrcList &src, // store for the tokenized text
29-
AGS::ccCompiledScript &scrip, // repository for the strings in the text
30-
AGS::SymbolTable &symt, // symbol table
31-
AGS::MessageHandler &mh); // warnings and the error
32-
33-
// Only use this function for googletests. Parse the input
34-
extern int cc_parse(
35-
AGS::SrcList &src, // tokenized text
36-
AGS::FlagSet options, // as defined in cc_options
37-
AGS::ccCompiledScript &scrip, // result of the compilation
38-
AGS::SymbolTable &symt, // symbol table
39-
AGS::MessageHandler &mh); // warnings and the error
40-
4125

4226
#endif

0 commit comments

Comments
 (0)