add Lua code tokenizer
This commit is contained in:
parent
3e6e897ce8
commit
28f49ac948
@ -214,6 +214,24 @@ std::string_view BasicParser::readUntil(char c) {
|
||||
return source.substr(start, pos - start);
|
||||
}
|
||||
|
||||
std::string_view BasicParser::readUntil(std::string_view s) {
|
||||
int start = pos;
|
||||
size_t found = source.find(s, pos);
|
||||
if (found == std::string::npos) {
|
||||
throw error(util::quote(std::string(s))+" expected");
|
||||
}
|
||||
skip(found - pos);
|
||||
return source.substr(start, pos - start);
|
||||
}
|
||||
|
||||
std::string_view BasicParser::readUntilWhitespace() {
|
||||
int start = pos;
|
||||
while (hasNext() && !is_whitespace(source[pos])) {
|
||||
pos++;
|
||||
}
|
||||
return source.substr(start, pos - start);
|
||||
}
|
||||
|
||||
std::string_view BasicParser::readUntilEOL() {
|
||||
int start = pos;
|
||||
while (hasNext() && source[pos] != '\r' && source[pos] != '\n') {
|
||||
|
||||
@ -105,6 +105,8 @@ protected:
|
||||
parsing_error error(const std::string& message);
|
||||
public:
|
||||
std::string_view readUntil(char c);
|
||||
std::string_view readUntil(std::string_view s);
|
||||
std::string_view readUntilWhitespace();
|
||||
std::string_view readUntilEOL();
|
||||
std::string parseName();
|
||||
std::string parseXmlName();
|
||||
|
||||
181
src/coders/lua_parsing.cpp
Normal file
181
src/coders/lua_parsing.cpp
Normal file
@ -0,0 +1,181 @@
|
||||
#include "lua_parsing.hpp"
|
||||
|
||||
#include <set>
|
||||
|
||||
#include "commons.hpp"
|
||||
|
||||
using namespace lua;
|
||||
|
||||
static std::set<std::string_view> keywords {
|
||||
"and", "break", "do", "else", "elseif", "end", "false", "for", "function",
|
||||
"if", "in", "local", "nil", "not", "or", "repeat", "return", "then", "true",
|
||||
"until", "while"
|
||||
};
|
||||
|
||||
bool lua::is_lua_keyword(std::string_view view) {
|
||||
return keywords.find(view) != keywords.end();
|
||||
}
|
||||
|
||||
/// An identifier may begin with an underscore or an ASCII letter.
/// Spelled out explicitly (rather than std::isalpha) to stay locale-free.
inline bool is_lua_identifier_start(int c) {
    if (c == '_') {
        return true;
    }
    return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
}
|
||||
|
||||
inline bool is_lua_identifier_part(int c) {
|
||||
return is_lua_identifier_start(c) || is_digit(c);
|
||||
}
|
||||
|
||||
/// First characters of the Lua operators this tokenizer recognizes.
inline bool is_lua_operator_start(int c) {
    switch (c) {
        case '=': case '~': case '+': case '-': case '/': case '*':
        case '%': case '^': case '#': case '<': case '>': case ':':
        case '.':
            return true;
        default:
            return false;
    }
}
|
||||
|
||||
/// Splits Lua source text into a flat list of Tokens using the character-level
/// primitives inherited (privately) from BasicParser.
class Tokenizer : BasicParser {
    std::vector<Token> tokens;  // accumulated output; moved out by tokenize()
public:
    Tokenizer(std::string_view file, std::string_view source)
        : BasicParser(file, source) {
    }

    /// @brief Read an identifier starting at the current position.
    /// @throws parsing_error if the current character cannot start one.
    /// NOTE(review): gates on is_identifier_start/is_identifier_part from
    /// commons.hpp, while tokenize() pre-checks with is_lua_identifier_start —
    /// confirm these two character classes agree.
    std::string parseLuaName() {
        char c = peek();
        if (!is_identifier_start(c)) {
            throw error("identifier expected");
        }
        int start = pos;
        while (hasNext() && is_identifier_part(source[pos])) {
            pos++;
        }
        return std::string(source.substr(start, pos - start));
    }

    /// @brief Snapshot the parser's current offset/line state as a Location.
    inline Location currentLocation() const {
        return Location {
            static_cast<int>(pos),
            static_cast<int>(linestart),
            static_cast<int>(line)};
    }

    /// @brief Append a token spanning [start, currentLocation()).
    /// @param standalone if true, one source character is consumed AFTER the
    /// end location is recorded — so single-character tokens get end == start.
    /// NOTE(review): confirm whether end should instead point past the
    /// consumed character for standalone tokens.
    void emitToken(
        TokenTag tag, std::string name, Location start, bool standalone=false
    ) {
        tokens.emplace_back(
            tag,
            std::move(name),
            std::move(start),
            currentLocation()
        );
        if (standalone) skip(1);
    }

    /// @brief Get next operator token without checking operator for existing
    std::string parseOperator() {
        int start = pos;
        char first = peek();
        switch (first) {
            // single-character operators with no multi-char continuation
            case '#': case '+': case '/': case '*': case '^':
            case '%':
                skip(1);
                return std::string({first});
            case '-':
                skip(1);
                // "--" starts a comment; the caller treats it specially
                if (peekNoJump() == '-') {
                    skip(1);
                    return "--";
                }
                return std::string({first});
        }
        skip(1);
        char second = peekNoJump();
        // two-character comparison operators: == ~= <= >=
        if ((first == '=' && second == '=') || (first == '~' && second == '=') ||
            (first == '<' && second == '=') || (first == '>' && second == '=')) {
            skip(1);
            return std::string(source.substr(start, pos - start));
        }
        // ".." (concat) and "..." (vararg); a lone '.' falls through as-is.
        // NOTE(review): "::" (Lua 5.2 label syntax) is not special-cased and
        // comes out as two ':' tokens — confirm that is intended.
        if (first == '.' && second == '.') {
            skip(1);
            if (peekNoJump() == '.') {
                skip(1);
            }
        }
        return std::string(source.substr(start, pos - start));
    }

    /// @brief Run the tokenizer over the whole source.
    /// @return all tokens; the internal buffer is left moved-from.
    /// @throws parsing_error on malformed input (bad identifier, unterminated
    /// string/long bracket, etc.)
    std::vector<Token> tokenize() {
        skipWhitespace();
        while (hasNext()) {
            // re-skip at the top of each iteration: token handlers below may
            // leave the position on whitespace
            skipWhitespace();
            if (!hasNext()) {
                continue;  // loop condition will now terminate (same as break)
            }
            char c = peek();
            auto start = currentLocation();
            if (is_lua_identifier_start(c)) {
                auto name = parseLuaName();
                emitToken(
                    is_lua_keyword(name) ? TokenTag::KEYWORD : TokenTag::NAME,
                    std::move(name),
                    start
                );
                continue;
            } else if (is_digit(c)) {
                // parseNumber is inherited from BasicParser; the argument is
                // presumably a sign multiplier — TODO confirm
                auto value = parseNumber(1);
                auto literal = source.substr(start.pos, pos - start.pos);
                emitToken(
                    value.isInteger() ? TokenTag::INTEGER : TokenTag::NUMBER,
                    std::string(literal),
                    start
                );
                continue;
            }
            switch (c) {
                case '(': case '[': case '{':
                    // (the long-bracket checks can only match when c == '[')
                    if (isNext("[==[")) {
                        // NOTE(review): the opening "[==[" is not skipped
                        // before readUntil, and the content is discarded
                        // instead of emitted as a STRING; also only bracket
                        // level 2 is handled (not [=[, [===[, ...) — looks
                        // like a bug or a deliberate "treat as comment"
                        // shortcut; confirm intent.
                        readUntil("]==]");
                        skip(4);
                        continue;
                    } else if (isNext("[[")) {
                        // [[...]] long string: emit contents without brackets
                        skip(2);
                        auto string = readUntil("]]");
                        skip(2);
                        emitToken(TokenTag::STRING, std::string(string), start);
                        continue;
                    }
                    emitToken(TokenTag::OPEN_BRACKET, std::string({c}), start, true);
                    continue;
                case ')': case ']': case '}':
                    emitToken(TokenTag::CLOSE_BRACKET, std::string({c}), start, true);
                    continue;
                case ',':
                    emitToken(TokenTag::COMMA, std::string({c}), start, true);
                    continue;
                case ';':
                    emitToken(TokenTag::SEMICOLON, std::string({c}), start, true);
                    continue;
                case '\'': case '"': {
                    // short string literal; parseString (BasicParser) is
                    // expected to consume up to and including the closing
                    // quote and handle escapes — TODO confirm
                    skip(1);
                    auto string = parseString(c);
                    emitToken(TokenTag::STRING, std::move(string), start);
                    continue;
                }
                default: break;
            }
            if (is_lua_operator_start(c)) {
                auto text = parseOperator();
                if (text == "--") {
                    // line comment; NOTE(review): block comments --[[ ... ]]
                    // are also discarded as a single line here — confirm.
                    skipLine();
                    continue;
                }
                emitToken(TokenTag::OPERATOR, std::move(text), start);
                continue;
            }
            // unrecognized character run: keep it as an UNEXPECTED token so
            // downstream consumers can report it
            auto text = readUntilWhitespace();
            emitToken(TokenTag::UNEXPECTED, std::string(text), start);
        }
        // moving a member (not a local) — std::move is required here; the
        // Tokenizer must not be reused afterwards
        return std::move(tokens);
    }
};
|
||||
|
||||
std::vector<Token> lua::tokenize(std::string_view file, std::string_view source) {
|
||||
return Tokenizer(file, source).tokenize();
|
||||
}
|
||||
35
src/coders/lua_parsing.hpp
Normal file
35
src/coders/lua_parsing.hpp
Normal file
@ -0,0 +1,35 @@
|
||||
#pragma once
|
||||
|
||||
#include <string>
#include <string_view>
#include <vector>
|
||||
|
||||
namespace lua {
    /// @brief Position of a character within a source file.
    struct Location {
        int pos;        // absolute offset from the start of the source
        int lineStart;  // offset of the first character of the current line
        int line;       // line index
    };

    /// @brief Lexical category of a token produced by lua::tokenize.
    enum class TokenTag {
        KEYWORD, NAME, INTEGER, NUMBER, OPEN_BRACKET, CLOSE_BRACKET, STRING,
        OPERATOR, COMMA, SEMICOLON, UNEXPECTED
    };

    /// @brief A single lexical token together with its source span.
    struct Token {
        TokenTag tag;
        std::string text;  // token text (decoded contents for STRING tokens)
        Location start;    // location of the token's first character
        Location end;      // location where scanning of the token stopped

        Token(TokenTag tag, std::string text, Location start, Location end)
            : tag(tag),
              text(std::move(text)),
              // Location is a trivially-copyable aggregate — plain copies;
              // std::move would be a no-op here (performance-move-const-arg)
              start(start),
              end(end) {
        }
    };

    /// @brief Check whether the given identifier is a reserved Lua keyword.
    bool is_lua_keyword(std::string_view view);

    /// @brief Split Lua source code into tokens.
    /// @param file file name used in error reports
    /// @param source Lua source code
    /// @throws parsing_error on malformed input
    std::vector<Token> tokenize(std::string_view file, std::string_view source);
}
|
||||
20
test/coders/lua_parsing.cpp
Normal file
20
test/coders/lua_parsing.cpp
Normal file
@ -0,0 +1,20 @@
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "coders/commons.hpp"
|
||||
#include "coders/lua_parsing.hpp"
|
||||
#include "files/files.hpp"
|
||||
#include "util/stringutil.hpp"
|
||||
|
||||
/// Smoke test: tokenizes the bundled stdlib script and dumps every token.
/// Relies on the working directory being the test build dir — TODO confirm.
TEST(lua_parsing, Tokenizer) {
    auto filename = "../../res/scripts/stdlib.lua";
    auto source = files::read_string(std::filesystem::u8path(filename));
    try {
        auto tokens = lua::tokenize(filename, source);
        for (const auto& token : tokens) {
            std::cout << (int)token.tag << " " << util::quote(token.text) << std::endl;
        }
    } catch (const parsing_error& err) {
        std::cerr << err.errorLog() << std::endl;
        // rethrow the original exception; `throw err;` would make a copy
        // (and would slice if a derived exception type were ever caught here)
        throw;
    }
}
|
||||
Loading…
x
Reference in New Issue
Block a user