Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 11 additions & 19 deletions Compiler/script2/cs_parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7722,21 +7722,6 @@ void AGS::Parser::Parse()
}
}

// Run the scanner over the text 'inpl': the tokens go to 'src', the symbols to 'symt'.
// Returns the negated value of mh.HasError(), i.e., 0 when no error has been recorded.
int cc_scan(std::string const &inpl, AGS::SrcList &src, AGS::ccCompiledScript &scrip, AGS::SymbolTable &symt, AGS::MessageHandler &mh)
{
    AGS::Scanner scanner{ inpl, src, scrip, symt, mh };
    scanner.Scan();

    int const error_flag = static_cast<int>(mh.HasError());
    return -error_flag;
}

// Run the parser over the token list 'src' with the given 'options'.
// Returns the negated value of mh.HasError(), i.e., 0 when no error has been recorded.
int cc_parse(AGS::SrcList &src, AGS::FlagSet options, AGS::ccCompiledScript &scrip, AGS::SymbolTable &symt, AGS::MessageHandler &mh)
{
    AGS::Parser parser{ src, options, scrip, symt, mh };
    parser.Parse();

    int const error_flag = static_cast<int>(mh.HasError());
    return -error_flag;
}

int cc_compile(std::string const &inpl, AGS::FlagSet options, AGS::ccCompiledScript &scrip, AGS::MessageHandler &mh)
{
AGS::SymbolTable local_symt;
Expand All @@ -7754,9 +7739,16 @@ int cc_compile(std::string const &inpl, AGS::FlagSet options, AGS::ccCompiledScr
src.NewSection("UnnamedSection");
src.NewLine(1u);

int error_code = cc_scan(inpl, src, scrip, symt, mh);
if (error_code >= 0)
error_code = cc_parse(src, options, scrip, symt, mh);
AGS::Scanner scanner = { inpl, FlagIsSet(options, SCOPT_UTF8), src, scrip, symt, mh };
scanner.Scan();
if (mh.HasError())
return -1;

AGS::Parser parser = { src, options, scrip, symt, mh };
parser.Parse();
if (mh.HasError())
return -2;

sections = lh.CreateSectionList();
return error_code;
return 0;
}
97 changes: 94 additions & 3 deletions Compiler/script2/cs_scanner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,9 @@
std::string const AGS::Scanner::kNewSectionLitPrefix = "__NEWSCRIPTSTART_";
size_t const AGS::Scanner::kNewSectionLitPrefixSize = 17u;

AGS::Scanner::Scanner(std::string const &input, SrcList &token_list, ccCompiledScript &string_collector, SymbolTable &symt, MessageHandler &messageHandler)
: _ocMatcher(*this)
AGS::Scanner::Scanner(std::string const &input, bool const use_utf8, SrcList &token_list, ccCompiledScript &string_collector, SymbolTable &symt, MessageHandler &messageHandler)
: _useUtf8(use_utf8)
, _ocMatcher(*this)
, _lineno(1u)
, _tokenList(token_list)
, _msgHandler(messageHandler)
Expand Down Expand Up @@ -345,7 +346,7 @@ void AGS::Scanner::ReadInNumberLit(std::string &symstring, ScanType &scan_type,

errno = 0;
char *endptr;
std::strtof(valstring.c_str(), &endptr);
(void) std::strtof(valstring.c_str(), &endptr);
bool const can_be_a_floating_point =
(errno == 0 || errno == ERANGE) && // range errors will be treated below
(valstring.length() == endptr - valstring.c_str()); // ensure that all chars are used up in the conversion
Expand Down Expand Up @@ -459,6 +460,11 @@ void AGS::Scanner::ReadInCharLit(std::string &symstring, CodeCell &value)

EscapedChar2Char(lit_char, symstring, lit_char);
}
else if (UseUtf8() && (lit_char < 0 || lit_char > 0x7f))
{
// Accept a UTF-8 sequence, use the codepoint that corresponds to the sequence
ReadInCharLit_utf8(lit_char, symstring, lit_char);
}

// Closing '\''
int const closer = Get();
Expand Down Expand Up @@ -558,6 +564,91 @@ void AGS::Scanner::EscapedChar2Char(int first_char_after_backslash, std::string
UserError("Unrecognized '\\%c' in character or string literal", first_char_after_backslash);
}

// Translate the UTF-8 sequence of a character literal into its code point.
// 'first_char'  The first byte of the sequence; the following bytes are fetched with Get().
// 'symstring'   Collects the bytes that this function processes. A plain ASCII 'first_char'
//               is appended, too; the lead byte of a multi-byte sequence is not appended here.
// 'converted'   Receives the code point (or 'first_char' itself when it is plain ASCII).
// Malformed sequences, EOF and read failures are reported through UserError().
// NOTE(review): The code relies on UserError() not returning (presumably it throws) -- confirm.
void AGS::Scanner::ReadInCharLit_utf8(int const first_char, std::string &symstring, int &converted)
{
    unsigned int const char0 = first_char & 0xff;
    if (char0 <= 0x7f)
    {
        // Plain ASCII, no UTF-8 multi-byte sequence
        symstring.push_back(first_char);
        converted = first_char;
        return;
    }

    // 0x80..0xbf can only be continuation bytes; 0xc0 and 0xc1 could only start
    // overlong encodings; 0xf5 and above would lead beyond U+10FFFF.
    if (char0 < 0xc2 || char0 > 0xf4)
        UserError("After the quote mark, found the byte '0x%02X'; "
            "a legal UTF-8 sequence cannot start this way (file corrupt?)", char0);

    do // exactly 1 time
    {
        int next_char = Get();
        if (EOFReached() || Failed())
            break; // to error msg
        symstring.push_back(next_char);
        unsigned int const char1 = next_char & 0xff;

        // Strictly going by the algo, you can encode some characters
        // in different ways, but the standard doesn't allow this:
        // You must always use the shortest encoding possible.
        // Thus, some combinations of first and second byte are forbidden.
        bool const no_continuation_byte = (char1 < 0x80 || char1 > 0xbf);
        bool const forbidden_combination =
            (char0 == 0xe0 && char1 <= 0x9f) || // overlong 3-byte encoding
            (char0 == 0xed && char1 >= 0xa0) || // surrogates U+D800..U+DFFF
            (char0 == 0xf0 && char1 <= 0x8f) || // overlong 4-byte encoding
            (char0 == 0xf4 && char1 >= 0x90);   // beyond U+10FFFF
        if (no_continuation_byte || forbidden_combination)
            UserError("After the quote mark, found the bytes '0x%02X%02X'; "
                "a legal UTF-8 sequence cannot start this way (file corrupt?)", char0, char1);

        if (char0 <= 0xdf)
        {
            // 2-byte sequence: 5 payload bits in the lead byte, 6 in the continuation byte
            converted = ((char0 & 0x1f) << 6) | (char1 & 0x3f);
            return;
        }

        next_char = Get();
        if (EOFReached() || Failed())
            break; // to error msg
        unsigned int const char2 = next_char & 0xff;
        if (char2 < 0x80 || char2 > 0xbf)
            UserError("After the quote mark, found the bytes '0x%02X%02X%02X'; "
                "a legal UTF-8 sequence cannot start this way (file corrupt?)", char0, char1, char2);
        symstring.push_back(next_char);

        if (char0 <= 0xef)
        {
            // 3-byte sequence: 4 payload bits in the lead byte, 6 in each continuation byte
            converted =
                ((char0 & 0x0F) << 12) |
                ((char1 & 0x3F) << 6) |
                (char2 & 0x3F);
            return;
        }

        // 4-byte sequence: 3 payload bits in the lead byte, 6 in each continuation byte
        next_char = Get();
        if (EOFReached() || Failed())
            break; // to error msg
        unsigned int const char3 = next_char & 0xff;
        if (char3 < 0x80 || char3 > 0xbf)
            UserError("After the quote mark, found the bytes '0x%02X%02X%02X%02X'; "
                "a legal UTF-8 sequence cannot start this way (file corrupt?)", char0, char1, char2, char3);
        symstring.push_back(next_char);

        converted =
            ((char0 & 0x07) << 18) |
            ((char1 & 0x3F) << 12) |
            ((char2 & 0x3F) << 6) |
            (char3 & 0x3F);
        return;
    } while (false);

    // Here in case of reading errors
    if (EOFReached())
        UserError("Reached the end of the input inmidst of a UTF-8 sequence (file corrupt?)");
    if (Failed())
        UserError("Encountered a reading error inmidst of a UTF-8 sequence (file corrupt?)");
}

void AGS::Scanner::ReadInStringLit(std::string &symstring, std::string &valstring)
{
symstring = "\"";
Expand Down
9 changes: 8 additions & 1 deletion Compiler/script2/cs_scanner.h
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ class Scanner
// We need that, so we collect in _eofReached the fact that end-of-stream has ever been reached.
bool _eofReached = false;
bool _failed = false;
bool _useUtf8;
std::size_t _lineno;
std::string _section;
SrcList &_tokenList;
Expand Down Expand Up @@ -131,6 +132,9 @@ class Scanner
// Translate a '\\' combination into a character, backslash is already read in
void EscapedChar2Char(int first_char_after_backslash, std::string &symstring, int &converted);

// Read a sequence of bytes in UTF-8 format, translate to the respective codepoint
void ReadInCharLit_utf8(int first_char, std::string &symstring, int &converted);

// Read oct combination \777; backslash is already read in
int OctDigits2Char(int first_digit_char, std::string &symstring);

Expand Down Expand Up @@ -192,11 +196,14 @@ class Scanner
// Replace the first occurrence of 'token' within 'where' by 'replacement'.
// When 'where' doesn't contain 'token', 'where' is left unchanged:
// Handing the npos that find() yields on to replace() would throw std::out_of_range.
inline static void ReplaceToken(std::string &where, std::string const &token, std::string const &replacement)
{
    std::size_t const token_pos = where.find(token);
    if (std::string::npos == token_pos)
        return;
    where.replace(token_pos, token.length(), replacement);
}

public:
Scanner(std::string const &input, SrcList &token_list, ccCompiledScript &string_collector, SymbolTable &symt, MessageHandler &messageHandler);
Scanner(std::string const &input, bool use_utf8, SrcList &token_list, ccCompiledScript &string_collector, SymbolTable &symt, MessageHandler &messageHandler);

// Scan the input into token_list; symbols into symt; strings into _string_collector
void Scan();

// Yields the UTF-8 flag that this scanner has been set up with
bool UseUtf8() const
{
    return _useUtf8;
}

// Yields the recorded end-of-file state, i.e., whether EOF has been encountered
bool EOFReached() const
{
    return _eofReached;
}
// Returns whether we've encountered a read failure
Expand Down
44 changes: 31 additions & 13 deletions Compiler/test2/cc_parser_test_0.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

#include "script2/cc_symboltable.h"
#include "script2/cc_internallist.h"
#include "script2/cs_scanner.h"
#include "script2/cs_parser.h"

#include "cc_parser_test_lib.h"
Expand Down Expand Up @@ -300,9 +301,12 @@ TEST_F(Compile0, EnumNegative) {
};\
";

// Call cc_scan() and cc_parse() by hand so that we can see the symbol table
ASSERT_LE(0, cc_scan(inpl, targ, scrip, sym, mh));
int compile_result = cc_parse(targ, options, scrip, sym, mh);
// Call scanner and parser by hand so that we can see the symbol table
AGS::Scanner scanner = { inpl, false, targ, scrip, sym, mh };
scanner.Scan();
ASSERT_FALSE(mh.HasError());
AGS::Parser parser = { targ, options, scrip, sym, mh };
parser.Parse();
std::string const &err_msg = mh.GetError().Message;
size_t err_line = mh.GetError().Lineno;
EXPECT_EQ(0u, mh.WarningsCount());
Expand Down Expand Up @@ -351,12 +355,14 @@ TEST_F(Compile0, DefaultParametersLargeInts) {
";


ASSERT_LE(0, cc_scan(inpl, targ, scrip, sym, mh));
int compile_result = cc_parse(targ, options, scrip, sym, mh);
AGS::Scanner scanner = { inpl, false, targ, scrip, sym, mh };
scanner.Scan();
ASSERT_FALSE(mh.HasError());
AGS::Parser parser = { targ, options, scrip, sym, mh };
parser.Parse();
std::string const &err_msg = mh.GetError().Message;
size_t err_line = mh.GetError().Lineno;
EXPECT_EQ(0u, mh.WarningsCount());

ASSERT_STREQ("Ok", mh.HasError() ? err_msg.c_str() : "Ok");

AGS::Symbol const funcidx = sym.Find("importedfunc");
Expand Down Expand Up @@ -405,12 +411,14 @@ TEST_F(Compile0, ImportFunctionReturningDynamicArray) {
}; \n\
";

ASSERT_LE(0, cc_scan(inpl, targ, scrip, sym, mh));
int compile_result = cc_parse(targ, options, scrip, sym, mh);
AGS::Scanner scanner = { inpl, false, targ, scrip, sym, mh };
scanner.Scan();
ASSERT_FALSE(mh.HasError());
AGS::Parser parser = { targ, options, scrip, sym, mh };
parser.Parse();
std::string const &err_msg = mh.GetError().Message;
size_t err_line = mh.GetError().Lineno;
EXPECT_EQ(0u, mh.WarningsCount());

ASSERT_STREQ("Ok", mh.HasError() ? err_msg.c_str() : "Ok");

int funcidx;
Expand Down Expand Up @@ -847,8 +855,13 @@ TEST_F(Compile0, LocalGlobalSeq2) {
AGS::SymbolTable sym;
AGS::FlagSet const options = ~SCOPT_NOIMPORTOVERRIDE | SCOPT_LINENUMBERS;

ASSERT_LE(0, cc_scan(inpl, targ, scrip, sym, mh));
ASSERT_EQ(0, cc_parse(targ, options, scrip, sym, mh));
AGS::Scanner scanner = { inpl, false, targ, scrip, sym, mh };
scanner.Scan();
ASSERT_FALSE(mh.HasError());
AGS::Parser parser = { targ, options, scrip, sym, mh };
parser.Parse();
std::string const &err_msg = mh.GetError().Message;
ASSERT_STREQ("Ok", mh.HasError() ? err_msg.c_str() : "Ok");

ASSERT_LE(1u, mh.GetMessages().size());
EXPECT_EQ(7u, mh.GetMessages()[0].Lineno);
Expand Down Expand Up @@ -1850,8 +1863,13 @@ TEST_F(Compile0, Import2GlobalAllocation) {
AGS::SymbolTable sym;
AGS::FlagSet const options = ~SCOPT_NOIMPORTOVERRIDE | SCOPT_LINENUMBERS;

ASSERT_LE(0, cc_scan(inpl, targ, scrip, sym, mh));
ASSERT_EQ(0, cc_parse(targ, options, scrip, sym, mh));
AGS::Scanner scanner = { inpl, false, targ, scrip, sym, mh };
scanner.Scan();
ASSERT_FALSE(mh.HasError());
AGS::Parser parser = { targ, options, scrip, sym, mh };
parser.Parse();
std::string const &err_msg = mh.GetError().Message;
ASSERT_STREQ("Ok", mh.HasError() ? err_msg.c_str() : "Ok");

AGS::Symbol const idx = sym.Find("J");
ASSERT_LE(0, idx);
Expand Down
16 changes: 0 additions & 16 deletions Compiler/test2/cc_parser_test_lib.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,21 +22,5 @@
extern char kAgsHeaderString[];
extern char kAgsHeaderBool[];

// Only use this function for googletests. Scan and tokenize the input.
extern int cc_scan(
std::string const &inpl, // preprocessed text to be tokenized
AGS::SrcList &src, // store for the tokenized text
AGS::ccCompiledScript &scrip, // repository for the strings in the text
AGS::SymbolTable &symt, // symbol table
AGS::MessageHandler &mh); // warnings and the error

// Only use this function for googletests. Parse the input
extern int cc_parse(
AGS::SrcList &src, // tokenized text
AGS::FlagSet options, // as defined in cc_options
AGS::ccCompiledScript &scrip, // result of the compilation
AGS::SymbolTable &symt, // symbol table
AGS::MessageHandler &mh); // warnings and the error


#endif
Loading