diff --git a/src/coders/commons.cpp b/src/coders/commons.cpp
index de4fb755..74fb87f5 100644
--- a/src/coders/commons.cpp
+++ b/src/coders/commons.cpp
@@ -214,6 +214,24 @@ std::string_view BasicParser::readUntil(char c) {
     return source.substr(start, pos - start);
 }
 
+std::string_view BasicParser::readUntil(std::string_view s) {
+    int start = pos;
+    const size_t found = source.find(s, pos);
+    if (found == std::string_view::npos) {
+        throw error(util::quote(std::string(s))+" expected");
+    }
+    skip(found - pos);
+    return source.substr(start, pos - start);
+}
+
+std::string_view BasicParser::readUntilWhitespace() {
+    int start = pos;
+    while (hasNext() && !is_whitespace(source[pos])) {
+        pos++;
+    }
+    return source.substr(start, pos - start);
+}
+
 std::string_view BasicParser::readUntilEOL() {
     int start = pos;
     while (hasNext() && source[pos] != '\r' && source[pos] != '\n') {
diff --git a/src/coders/commons.hpp b/src/coders/commons.hpp
index abd4c01a..c01601cb 100644
--- a/src/coders/commons.hpp
+++ b/src/coders/commons.hpp
@@ -105,6 +105,11 @@ protected:
     parsing_error error(const std::string& message);
 public:
     std::string_view readUntil(char c);
+    /// @brief Read up to (not including) the next occurrence of s
+    /// @throws parsing_error if s is not found in the rest of the source
+    std::string_view readUntil(std::string_view s);
+    /// @brief Read up to the next whitespace character or the end of source
+    std::string_view readUntilWhitespace();
     std::string_view readUntilEOL();
     std::string parseName();
     std::string parseXmlName();
diff --git a/src/coders/lua_parsing.cpp b/src/coders/lua_parsing.cpp
new file mode 100644
index 00000000..f3ca21ef
--- /dev/null
+++ b/src/coders/lua_parsing.cpp
@@ -0,0 +1,200 @@
+#include "lua_parsing.hpp"
+
+#include <set>
+
+#include "commons.hpp"
+
+using namespace lua;
+
+/// Words that tokenize() tags as KEYWORD instead of NAME.
+static const std::set<std::string_view> keywords {
+    "and", "break", "do", "else", "elseif", "end", "false", "for", "function",
+    "if", "in", "local", "nil", "not", "or", "repeat", "return", "then", "true",
+    "until", "while"
+};
+
+bool lua::is_lua_keyword(std::string_view view) {
+    return keywords.find(view) != keywords.end();
+}
+
+/// @return true for an ASCII letter or underscore
+static inline bool is_lua_identifier_start(int c) {
+    return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || c == '_';
+}
+
+/// @return true for an identifier start character or anything is_digit accepts
+static inline bool is_lua_identifier_part(int c) {
+    return is_lua_identifier_start(c) || is_digit(c);
+}
+
+/// @return true if c is the first character of an operator token
+static inline bool is_lua_operator_start(int c) {
+    return c == '=' || c == '~' || c == '+' || c == '-' || c == '/' || c == '*'
+        || c == '%' || c == '^' || c == '#' || c == '<' || c == '>' || c == ':'
+        || c == '.';
+}
+
+/// @brief Splits Lua source text into a flat list of tokens.
+class Tokenizer : BasicParser {
+    std::vector<Token> tokens;
+public:
+    Tokenizer(std::string_view file, std::string_view source)
+        : BasicParser(file, source) {
+    }
+
+    /// @brief Read a name made of identifier characters.
+    /// @throws parsing_error if the current character cannot start a name
+    std::string parseLuaName() {
+        char c = peek();
+        // Use the Lua-specific predicates so the result agrees with the
+        // is_lua_identifier_start() dispatch in tokenize().
+        if (!is_lua_identifier_start(c)) {
+            throw error("identifier expected");
+        }
+        int start = pos;
+        while (hasNext() && is_lua_identifier_part(source[pos])) {
+            pos++;
+        }
+        return std::string(source.substr(start, pos - start));
+    }
+
+    /// @brief Current parser position packed into a Location.
+    inline Location currentLocation() const {
+        return Location {
+            static_cast<int>(pos),
+            static_cast<int>(linestart),
+            static_cast<int>(line)};
+    }
+
+    /// @brief Append a token spanning from start to the current position.
+    /// @param standalone the token is a single character that has not been
+    /// consumed yet; it is skipped before the end location is taken
+    void emitToken(
+        TokenTag tag, std::string name, Location start, bool standalone=false
+    ) {
+        // Consume first so a standalone token does not end where it starts.
+        if (standalone) skip(1);
+        tokens.emplace_back(tag, std::move(name), start, currentLocation());
+    }
+
+    /// @brief Read the next operator token. The text is not checked against
+    /// the list of existing Lua operators.
+    /// @return operator text; "--" marks the start of a comment
+    std::string parseOperator() {
+        int start = pos;
+        char first = peek();
+        switch (first) {
+            case '#': case '+': case '/': case '*': case '^':
+            case '%':
+                skip(1);
+                return std::string(1, first);
+            case '-':
+                skip(1);
+                if (peekNoJump() == '-') {
+                    skip(1);
+                    return "--";
+                }
+                return std::string(1, first);
+        }
+        skip(1);
+        char second = peekNoJump();
+        // two-character operators: == ~= <= >=
+        if ((first == '=' && second == '=') || (first == '~' && second == '=') ||
+            (first == '<' && second == '=') || (first == '>' && second == '=')) {
+            skip(1);
+            return std::string(source.substr(start, pos - start));
+        }
+        // ".." and "..."
+        if (first == '.' && second == '.') {
+            skip(1);
+            if (peekNoJump() == '.') {
+                skip(1);
+            }
+        }
+        return std::string(source.substr(start, pos - start));
+    }
+
+    /// @brief Tokenize the whole source.
+    /// @return collected tokens, moved out of the tokenizer
+    std::vector<Token> tokenize() {
+        skipWhitespace();
+        while (hasNext()) {
+            skipWhitespace();
+            if (!hasNext()) {
+                continue;
+            }
+            char c = peek();
+            auto start = currentLocation();
+            if (is_lua_identifier_start(c)) {
+                auto name = parseLuaName();
+                // Classify before moving: function arguments are evaluated
+                // in unspecified order, so name must not be read and moved
+                // from within the same call.
+                const auto tag = is_lua_keyword(name)
+                    ? TokenTag::KEYWORD : TokenTag::NAME;
+                emitToken(tag, std::move(name), start);
+                continue;
+            } else if (is_digit(c)) {
+                auto value = parseNumber(1);
+                auto literal = source.substr(start.pos, pos - start.pos);
+                emitToken(
+                    value.isInteger() ? TokenTag::INTEGER : TokenTag::NUMBER,
+                    std::string(literal),
+                    start
+                );
+                continue;
+            }
+            switch (c) {
+                case '(': case '[': case '{':
+                    if (isNext("[==[")) {
+                        // NOTE(review): "[==[ ... ]==]" blocks are dropped
+                        // without emitting a token - confirm this is intended
+                        readUntil("]==]");
+                        skip(4);
+                        continue;
+                    } else if (isNext("[[")) {
+                        skip(2);
+                        auto string = readUntil("]]");
+                        skip(2);
+                        emitToken(TokenTag::STRING, std::string(string), start);
+                        continue;
+                    }
+                    emitToken(TokenTag::OPEN_BRACKET, std::string(1, c), start, true);
+                    continue;
+                case ')': case ']': case '}':
+                    emitToken(TokenTag::CLOSE_BRACKET, std::string(1, c), start, true);
+                    continue;
+                case ',':
+                    emitToken(TokenTag::COMMA, std::string(1, c), start, true);
+                    continue;
+                case ';':
+                    emitToken(TokenTag::SEMICOLON, std::string(1, c), start, true);
+                    continue;
+                case '\'': case '"': {
+                    skip(1);
+                    auto string = parseString(c);
+                    emitToken(TokenTag::STRING, std::move(string), start);
+                    continue;
+                }
+                default: break;
+            }
+            if (is_lua_operator_start(c)) {
+                auto text = parseOperator();
+                if (text == "--") {
+                    // comment: skipLine() is expected to drop the rest of the line
+                    skipLine();
+                    continue;
+                }
+                emitToken(TokenTag::OPERATOR, std::move(text), start);
+                continue;
+            }
+            auto text = readUntilWhitespace();
+            emitToken(TokenTag::UNEXPECTED, std::string(text), start);
+        }
+        return std::move(tokens);
+    }
+};
+
+std::vector<Token> lua::tokenize(std::string_view file, std::string_view source) {
+    return Tokenizer(file, source).tokenize();
+}
diff --git a/src/coders/lua_parsing.hpp b/src/coders/lua_parsing.hpp
new file mode 100644
index 00000000..68cc385a
--- /dev/null
+++ b/src/coders/lua_parsing.hpp
@@ -0,0 +1,44 @@
+#pragma once
+
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+namespace lua {
+    /// @brief Position inside the source text.
+    struct Location {
+        int pos;
+        int lineStart;
+        int line;
+    };
+
+    /// @brief Kind of a token.
+    enum class TokenTag {
+        KEYWORD, NAME, INTEGER, NUMBER, OPEN_BRACKET, CLOSE_BRACKET, STRING,
+        OPERATOR, COMMA, SEMICOLON, UNEXPECTED
+    };
+
+    /// @brief Token text together with its kind and source range.
+    struct Token {
+        TokenTag tag;
+        std::string text;
+        Location start;
+        Location end;
+
+        Token(TokenTag tag, std::string text, Location start, Location end)
+            : tag(tag),
+              text(std::move(text)),
+              start(std::move(start)),
+              end(std::move(end)) {
+        }
+    };
+
+    /// @return true if view is one of the reserved words known to the tokenizer
+    bool is_lua_keyword(std::string_view view);
+
+    /// @brief Split Lua source text into tokens.
+    /// @param file file name associated with the source
+    /// @param source text to tokenize
+    std::vector<Token> tokenize(std::string_view file, std::string_view source);
+}
diff --git a/test/coders/lua_parsing.cpp b/test/coders/lua_parsing.cpp
new file mode 100644
index 00000000..9788d1d7
--- /dev/null
+++ b/test/coders/lua_parsing.cpp
@@ -0,0 +1,22 @@
+#include <gtest/gtest.h>
+
+#include "coders/commons.hpp"
+#include "coders/lua_parsing.hpp"
+#include "files/files.hpp"
+#include "util/stringutil.hpp"
+
+TEST(lua_parsing, Tokenizer) {
+    auto filename = "../../res/scripts/stdlib.lua";
+    auto source = files::read_string(std::filesystem::u8path(filename));
+    try {
+        auto tokens = lua::tokenize(filename, source);
+        for (const auto& token : tokens) {
+            std::cout << static_cast<int>(token.tag) << " "
+                      << util::quote(token.text) << '\n';
+        }
+    } catch (const parsing_error& err) {
+        std::cerr << err.errorLog() << std::endl;
+        // rethrow the original exception object instead of a copy
+        throw;
+    }
+}