From 8a8c1525fdf634d848db47ff23f656676b3d71e2 Mon Sep 17 00:00:00 2001 From: MihailRis Date: Sat, 22 Feb 2025 01:01:20 +0300 Subject: [PATCH] Fix syntax highlighting unicode support (#475) * convert BasicParser to a template * fix syntax hightlighting with unicode characters --- src/coders/BasicParser.hpp | 53 +++ src/coders/BasicParser.inl | 461 +++++++++++++++++++++++++++ src/coders/commons.cpp | 405 ++--------------------- src/coders/commons.hpp | 69 +--- src/coders/json.cpp | 4 +- src/coders/lua_parsing.cpp | 72 +++-- src/coders/lua_parsing.hpp | 4 +- src/coders/obj.cpp | 4 +- src/coders/toml.cpp | 4 +- src/coders/xml.cpp | 3 +- src/devtools/syntax.hpp | 4 +- src/devtools/syntax_highlighting.cpp | 2 +- src/devtools/syntax_highlighting.hpp | 2 +- src/frontend/locale.cpp | 4 +- src/graphics/ui/elements/TextBox.cpp | 4 +- src/logic/CommandsInterpreter.cpp | 4 +- src/util/stringutil.cpp | 30 +- src/util/stringutil.hpp | 18 +- test/coders/lua_parsing.cpp | 6 +- 19 files changed, 644 insertions(+), 509 deletions(-) create mode 100644 src/coders/BasicParser.hpp create mode 100644 src/coders/BasicParser.inl diff --git a/src/coders/BasicParser.hpp b/src/coders/BasicParser.hpp new file mode 100644 index 00000000..e3a3dda8 --- /dev/null +++ b/src/coders/BasicParser.hpp @@ -0,0 +1,53 @@ +#pragma once + +#include "commons.hpp" +#include "data/dv.hpp" + +template +class BasicParser { + using StringT = std::basic_string; + using StringViewT = std::basic_string_view; +protected: + std::string_view filename; + StringViewT source; + uint pos = 0; + uint line = 1; + uint linestart = 0; + + virtual void skipWhitespace(); + void skip(size_t n); + void skipLine(); + bool skipTo(const StringT& substring); + void expect(CharT expected); + void expect(const StringT& substring); + bool isNext(const StringT& substring); + void expectNewLine(); + void goBack(size_t count = 1); + void reset(); + + int64_t parseSimpleInt(int base); + dv::value parseNumber(int sign); + dv::value parseNumber(); + StringT parseString(CharT chr, bool closeRequired = true); + + parsing_error error(const std::string& message); +public: + StringViewT readUntil(CharT c); + StringViewT readUntil(StringViewT s, bool nothrow); + StringViewT readUntilWhitespace(); + StringViewT readUntilEOL(); + StringT parseName(); + StringT parseXmlName(); + bool hasNext(); + size_t remain() const; + CharT peek(); + CharT peekInLine(); + CharT peekNoJump(); + CharT nextChar(); + + BasicParser(std::string_view file, StringViewT source) + : filename(file), source(source) { + } +}; + +#include "BasicParser.inl" diff --git a/src/coders/BasicParser.inl b/src/coders/BasicParser.inl new file mode 100644 index 00000000..40161e5a --- /dev/null +++ b/src/coders/BasicParser.inl @@ -0,0 +1,461 @@ +#include "BasicParser.hpp" + +#include +#include + +#include "util/stringutil.hpp" + +namespace { + int is_box(int c) { + switch (c) { + case 'B': + case 'b': + return 2; + case 'O': + case 'o': + return 8; + case 'X': + case 'x': + return 16; + } + return 10; + } + + double power(double base, int64_t power) { + double result = 1.0; + for (int64_t i = 0; i < power; i++) { + result *= base; + } + return result; + } +} + +template +void BasicParser::skipWhitespace() { + while (hasNext()) { + char next = source[pos]; + if (next == '\n') { + line++; + linestart = ++pos; + continue; + } + if (is_whitespace(next)) { + pos++; + } else { + break; + } + } +} + +template +void BasicParser::skip(size_t n) { + n = std::min(n, source.length() - pos); + + for (size_t i = 0; i < n; i++) { + char next = source[pos++]; + if (next == '\n') { + line++; + linestart = pos; + } + } +} + +template +void BasicParser::skipLine() { + while (hasNext()) { + if (source[pos] == '\n') { + pos++; + linestart = pos; + line++; + break; + } + pos++; + } +} + +template +bool BasicParser::skipTo(const std::basic_string& substring) { + size_t idx = source.find(substring, pos); + if (idx == std::string::npos) { + skip(source.length() - pos); + return false; + } else { + skip(idx - pos); + return true; + } +} + +template +bool BasicParser::hasNext() { + return pos < source.length(); +} + +template +size_t BasicParser::remain() const { + return source.length() - pos; +} + +template +bool BasicParser::isNext(const std::basic_string& substring) { + if (source.length() - pos < substring.length()) { + return false; + } + return source.substr(pos, substring.length()) == substring; +} + +template +CharT BasicParser::nextChar() { + if (!hasNext()) { + throw error("unexpected end"); + } + return source[pos++]; +} + +template +void BasicParser::expect(CharT expected) { + char c = peek(); + if (c != expected) { + throw error( + "'" + std::string({static_cast(expected)}) + "' expected" + ); + } + pos++; +} + +template +void BasicParser::expect(const std::basic_string& substring) { + if (substring.empty()) return; + for (uint i = 0; i < substring.length(); i++) { + if (source.length() <= pos + i || source[pos + i] != substring[i]) { + throw error( + util::quote(util::str2str_utf8(substring)) + " expected" + ); + } + } + pos += substring.length(); +} + +template +void BasicParser::expectNewLine() { + while (hasNext()) { + char next = source[pos]; + if (next == '\n') { + line++; + linestart = ++pos; + return; + } + if (is_whitespace(next)) { + pos++; + } else { + throw error("line separator expected"); + } + } +} + +template +void BasicParser::goBack(size_t count) { + if (pos < count) { + throw std::runtime_error("pos < jump"); + } + if (pos) { + pos -= count; + } +} + +template void BasicParser::reset() { + pos = 0; +} + +template +CharT BasicParser::peekInLine() { + while (hasNext()) { + CharT next = source[pos]; + if (next == '\n') { + return next; + } + if (is_whitespace(next)) { + pos++; + } else { + break; + } + } + if (pos >= source.length()) { + throw error("unexpected end"); + } + return source[pos]; +} + +template +CharT BasicParser::peek() { + skipWhitespace(); + if (pos >= source.length()) { + throw error("unexpected end"); + } + return source[pos]; +} + +template +CharT BasicParser::peekNoJump() { + if (pos >= source.length()) { + throw error("unexpected end"); + } + return source[pos]; +} + +template +std::basic_string_view BasicParser::readUntil(CharT c) { + int start = pos; + while (hasNext() && source[pos] != c) { + pos++; + } + return source.substr(start, pos - start); +} + +template +std::basic_string_view BasicParser::readUntil( + std::basic_string_view s, bool nothrow +) { + int start = pos; + size_t found = source.find(s, pos); + if (found == std::string::npos) { + if (nothrow) { + pos = source.size(); + return source.substr(start); + } + throw error(util::quote(util::str2str_utf8(s)) + " expected"); + } + skip(found - pos); + return source.substr(start, pos - start); +} + +template +std::basic_string_view BasicParser::readUntilWhitespace() { + int start = pos; + while (hasNext() && !is_whitespace(source[pos])) { + pos++; + } + return source.substr(start, pos - start); +} + +template +std::basic_string_view BasicParser::readUntilEOL() { + int start = pos; + while (hasNext() && source[pos] != '\r' && source[pos] != '\n') { + pos++; + } + return source.substr(start, pos - start); +} + +template +std::basic_string BasicParser::parseName() { + char c = peek(); + if (!is_identifier_start(c)) { + throw error("identifier expected"); + } + int start = pos; + while (hasNext() && is_identifier_part(source[pos])) { + pos++; + } + return std::basic_string(source.substr(start, pos - start)); +} + +template +std::basic_string BasicParser::parseXmlName() { + CharT c = peek(); + if (!is_json_identifier_start(c)) { + throw error("identifier expected"); + } + int start = pos; + while (hasNext() && is_json_identifier_part(source[pos])) { + pos++; + } + return std::basic_string(source.substr(start, pos - start)); +} + +template +int64_t BasicParser::parseSimpleInt(int base) { + CharT c = peek(); + int index = hexchar2int(c); + if (index == -1 || index >= base) { + throw error("invalid number literal"); + } + int64_t value = index; + pos++; + while (hasNext()) { + c = source[pos]; + while (c == '_') { + c = source[++pos]; + } + index = hexchar2int(c); + if (index == -1 || index >= base) { + return value; + } + value *= base; + value += index; + pos++; + } + return value; +} + +template +dv::value BasicParser::parseNumber() { + switch (peek()) { + case '-': + skip(1); + return parseNumber(-1); + case '+': + skip(1); + return parseNumber(1); + default: + return parseNumber(1); + } +} + +template +dv::value BasicParser::parseNumber(int sign) { + CharT c = peek(); + int base = 10; + if (c == '0' && pos + 1 < source.length() && + (base = is_box(source[pos + 1])) != 10) { + pos += 2; + return parseSimpleInt(base); + } else if (c == 'i' && pos + 2 < source.length() && source[pos + 1] == 'n' && source[pos + 2] == 'f') { + pos += 3; + return INFINITY * sign; + } else if (c == 'n' && pos + 2 < source.length() && source[pos + 1] == 'a' && source[pos + 2] == 'n') { + pos += 3; + return NAN * sign; + } + int64_t value = parseSimpleInt(base); + if (!hasNext()) { + return value * sign; + } + c = source[pos]; + if (c == 'e' || c == 'E') { + pos++; + int s = 1; + if (peek() == '-') { + s = -1; + pos++; + } else if (peek() == '+') { + pos++; + } + return sign * value * power(10.0, s * parseSimpleInt(10)); + } + if (c == '.') { + pos++; + int64_t expo = 1; + while (hasNext() && source[pos] == '0') { + expo *= 10; + pos++; + } + int64_t afterdot = 0; + if (hasNext() && is_digit(source[pos])) { + afterdot = parseSimpleInt(10); + } + expo *= power( + 10, std::max( + static_cast(0), + static_cast(std::log10(afterdot) + 1) + ) + ); + c = source[pos]; + + double dvalue = (value + (afterdot / (double)expo)); + if (c == 'e' || c == 'E') { + pos++; + int s = 1; + if (peek() == '-') { + s = -1; + pos++; + } else if (peek() == '+') { + pos++; + } + return sign * dvalue * power(10.0, s * parseSimpleInt(10)); + } + return sign * dvalue; + } + return sign * value; +} + +template +std::basic_string BasicParser::parseString( + CharT quote, bool closeRequired +) { + std::basic_stringstream ss; + while (hasNext()) { + CharT c = source[pos]; + if (c == quote) { + pos++; + return ss.str(); + } + if (c == '\\') { + pos++; + c = nextChar(); + if (c >= '0' && c <= '7') { + pos--; + ss << (char)parseSimpleInt(8); + continue; + } + if (c == 'u') { + int codepoint = parseSimpleInt(16); + ubyte bytes[4]; + int size = util::encode_utf8(codepoint, bytes); + CharT chars[4]; + for (int i = 0; i < 4; i++) { + chars[i] = bytes[i]; + } + ss.write(chars, size); + continue; + } + switch (c) { + case 'n': ss << '\n'; break; + case 'r': ss << '\r'; break; + case 'b': ss << '\b'; break; + case 't': ss << '\t'; break; + case 'f': ss << '\f'; break; + case '\'': ss << '\''; break; + case '"': ss << '"'; break; + case '\\': ss << '\\'; break; + case '/': ss << '/'; break; + case '\n': continue; + default: + throw error( + "'\\" + + util::str2str_utf8(std::basic_string({c})) + + "' is an illegal escape" + ); + } + continue; + } + if (c == '\n' && closeRequired) { + throw error("non-closed string literal"); + } + ss << c; + pos++; + } + if (closeRequired) { + throw error("unexpected end"); + } + return ss.str(); +} + +template <> +inline parsing_error BasicParser::error(const std::string& message) { + return parsing_error(message, filename, source, pos, line, linestart); +} + +template <> +inline parsing_error BasicParser::error(const std::string& message) { + size_t utf8pos = util::length_utf8(source.substr(0, pos)); + size_t utf8linestart = + utf8pos - util::length_utf8(source.substr(linestart, pos)); + return parsing_error( + message, + filename, + util::str2str_utf8(source), + utf8pos, + line, + utf8linestart + ); +} diff --git a/src/coders/commons.cpp b/src/coders/commons.cpp index cd9a2d4c..273f74f5 100644 --- a/src/coders/commons.cpp +++ b/src/coders/commons.cpp @@ -1,20 +1,9 @@ #include "commons.hpp" - -#include - #include #include #include "util/stringutil.hpp" -inline double power(double base, int64_t power) { - double result = 1.0; - for (int64_t i = 0; i < power; i++) { - result *= base; - } - return result; -} - parsing_error::parsing_error( const std::string& message, std::string_view filename, @@ -35,6 +24,26 @@ parsing_error::parsing_error( this->source = source.substr(linestart, end - linestart); } +parsing_error::parsing_error( + const std::string& message, + std::string&& filename, + std::string&& source, + uint pos, + uint line, + uint linestart +) + : std::runtime_error(message), + filename(std::move(filename)), + pos(pos), + line(line), + linestart(linestart) { + size_t end = source.find("\n", linestart); + if (end == std::string::npos) { + end = source.length(); + } + this->source = source.substr(linestart, end - linestart); +} + std::string parsing_error::errorLog() const { std::stringstream ss; uint linepos = pos - linestart; @@ -48,377 +57,3 @@ std::string parsing_error::errorLog() const { ss << "^"; return ss.str(); } - -BasicParser::BasicParser(std::string_view file, std::string_view source) - : filename(file), source(source) { -} - -void BasicParser::skipWhitespace() { - while (hasNext()) { - char next = source[pos]; - if (next == '\n') { - line++; - linestart = ++pos; - continue; - } - if (is_whitespace(next)) { - pos++; - } else { - break; - } - } -} - -void BasicParser::skip(size_t n) { - n = std::min(n, source.length() - pos); - - for (size_t i = 0; i < n; i++) { - char next = source[pos++]; - if (next == '\n') { - line++; - linestart = pos; - } - } -} - -void BasicParser::skipLine() { - while (hasNext()) { - if (source[pos] == '\n') { - pos++; - linestart = pos; - line++; - break; - } - pos++; - } -} - -bool BasicParser::skipTo(const std::string& substring) { - size_t idx = source.find(substring, pos); - if (idx == std::string::npos) { - skip(source.length() - pos); - return false; - } else { - skip(idx - pos); - return true; - } -} - -bool BasicParser::hasNext() { - return pos < source.length(); -} - -size_t BasicParser::remain() const { - return source.length() - pos; -} - -bool BasicParser::isNext(const std::string& substring) { - if (source.length() - pos < substring.length()) { - return false; - } - return source.substr(pos, substring.length()) == substring; -} - -char BasicParser::nextChar() { - if (!hasNext()) { - throw error("unexpected end"); - } - return source[pos++]; -} - -void BasicParser::expect(char expected) { - char c = peek(); - if (c != expected) { - throw error("'" + std::string({expected}) + "' expected"); - } - pos++; -} - -void BasicParser::expect(const std::string& substring) { - if (substring.empty()) return; - for (uint i = 0; i < substring.length(); i++) { - if (source.length() <= pos + i || source[pos + i] != substring[i]) { - throw error(util::quote(substring) + " expected"); - } - } - pos += substring.length(); -} - -void BasicParser::expectNewLine() { - while (hasNext()) { - char next = source[pos]; - if (next == '\n') { - line++; - linestart = ++pos; - return; - } - if (is_whitespace(next)) { - pos++; - } else { - throw error("line separator expected"); - } - } -} - -void BasicParser::goBack(size_t count) { - if (pos < count) { - throw std::runtime_error("pos < jump"); - } - if (pos) { - pos -= count; - } -} - -void BasicParser::reset() { - pos = 0; -} - -char BasicParser::peekInLine() { - while (hasNext()) { - char next = source[pos]; - if (next == '\n') { - return next; - } - if (is_whitespace(next)) { - pos++; - } else { - break; - } - } - if (pos >= source.length()) { - throw error("unexpected end"); - } - return source[pos]; -} - -char BasicParser::peek() { - skipWhitespace(); - if (pos >= source.length()) { - throw error("unexpected end"); - } - return source[pos]; -} - -char BasicParser::peekNoJump() { - if (pos >= source.length()) { - throw error("unexpected end"); - } - return source[pos]; -} - -std::string_view BasicParser::readUntil(char c) { - int start = pos; - while (hasNext() && source[pos] != c) { - pos++; - } - return source.substr(start, pos - start); -} - -std::string_view BasicParser::readUntil(std::string_view s, bool nothrow) { - int start = pos; - size_t found = source.find(s, pos); - if (found == std::string::npos) { - if (nothrow) { - pos = source.size(); - return source.substr(start); - } - throw error(util::quote(std::string(s))+" expected"); - } - skip(found - pos); - return source.substr(start, pos - start); -} - -std::string_view BasicParser::readUntilWhitespace() { - int start = pos; - while (hasNext() && !is_whitespace(source[pos])) { - pos++; - } - return source.substr(start, pos - start); -} - -std::string_view BasicParser::readUntilEOL() { - int start = pos; - while (hasNext() && source[pos] != '\r' && source[pos] != '\n') { - pos++; - } - return source.substr(start, pos - start); -} - -std::string BasicParser::parseName() { - char c = peek(); - if (!is_identifier_start(c)) { - throw error("identifier expected"); - } - int start = pos; - while (hasNext() && is_identifier_part(source[pos])) { - pos++; - } - return std::string(source.substr(start, pos - start)); -} - -std::string BasicParser::parseXmlName() { - char c = peek(); - if (!is_json_identifier_start(c)) { - throw error("identifier expected"); - } - int start = pos; - while (hasNext() && is_json_identifier_part(source[pos])) { - pos++; - } - return std::string(source.substr(start, pos - start)); -} - -int64_t BasicParser::parseSimpleInt(int base) { - char c = peek(); - int index = hexchar2int(c); - if (index == -1 || index >= base) { - throw error("invalid number literal"); - } - int64_t value = index; - pos++; - while (hasNext()) { - c = source[pos]; - while (c == '_') { - c = source[++pos]; - } - index = hexchar2int(c); - if (index == -1 || index >= base) { - return value; - } - value *= base; - value += index; - pos++; - } - return value; -} - -dv::value BasicParser::parseNumber() { - switch (peek()) { - case '-': - skip(1); - return parseNumber(-1); - case '+': - skip(1); - return parseNumber(1); - default: - return parseNumber(1); - } -} - -dv::value BasicParser::parseNumber(int sign) { - char c = peek(); - int base = 10; - if (c == '0' && pos + 1 < source.length() && - (base = is_box(source[pos + 1])) != 10) { - pos += 2; - return parseSimpleInt(base); - } else if (c == 'i' && pos + 2 < source.length() && source[pos + 1] == 'n' && source[pos + 2] == 'f') { - pos += 3; - return INFINITY * sign; - } else if (c == 'n' && pos + 2 < source.length() && source[pos + 1] == 'a' && source[pos + 2] == 'n') { - pos += 3; - return NAN * sign; - } - int64_t value = parseSimpleInt(base); - if (!hasNext()) { - return value * sign; - } - c = source[pos]; - if (c == 'e' || c == 'E') { - pos++; - int s = 1; - if (peek() == '-') { - s = -1; - pos++; - } else if (peek() == '+') { - pos++; - } - return sign * value * power(10.0, s * parseSimpleInt(10)); - } - if (c == '.') { - pos++; - int64_t expo = 1; - while (hasNext() && source[pos] == '0') { - expo *= 10; - pos++; - } - int64_t afterdot = 0; - if (hasNext() && is_digit(source[pos])) { - afterdot = parseSimpleInt(10); - } - expo *= power(10, fmax(0, log10(afterdot) + 1)); - c = source[pos]; - - double dvalue = (value + (afterdot / (double)expo)); - if (c == 'e' || c == 'E') { - pos++; - int s = 1; - if (peek() == '-') { - s = -1; - pos++; - } else if (peek() == '+') { - pos++; - } - return sign * dvalue * power(10.0, s * parseSimpleInt(10)); - } - return sign * dvalue; - } - return sign * value; -} - -std::string BasicParser::parseString(char quote, bool closeRequired) { - std::stringstream ss; - while (hasNext()) { - char c = source[pos]; - if (c == quote) { - pos++; - return ss.str(); - } - if (c == '\\') { - pos++; - c = nextChar(); - if (c >= '0' && c <= '7') { - pos--; - ss << (char)parseSimpleInt(8); - continue; - } - if (c == 'u') { - int codepoint = parseSimpleInt(16); - ubyte bytes[4]; - int size = util::encode_utf8(codepoint, bytes); - ss.write(reinterpret_cast(bytes), size); - continue; - } - switch (c) { - case 'n': ss << '\n'; break; - case 'r': ss << '\r'; break; - case 'b': ss << '\b'; break; - case 't': ss << '\t'; break; - case 'f': ss << '\f'; break; - case '\'': ss << '\''; break; - case '"': ss << '"'; break; - case '\\': ss << '\\'; break; - case '/': ss << '/'; break; - case '\n': continue; - default: - throw error( - "'\\" + std::string({c}) + "' is an illegal escape" - ); - } - continue; - } - if (c == '\n' && closeRequired) { - throw error("non-closed string literal"); - } - ss << c; - pos++; - } - if (closeRequired) { - throw error("unexpected end"); - } - return ss.str(); -} - -parsing_error BasicParser::error(const std::string& message) { - return parsing_error(message, filename, source, pos, line, linestart); -} diff --git a/src/coders/commons.hpp b/src/coders/commons.hpp index 9251d57d..99047499 100644 --- a/src/coders/commons.hpp +++ b/src/coders/commons.hpp @@ -3,24 +3,8 @@ #include #include -#include "data/dv.hpp" #include "typedefs.hpp" -inline int is_box(int c) { - switch (c) { - case 'B': - case 'b': - return 2; - case 'O': - case 'o': - return 8; - case 'X': - case 'x': - return 16; - } - return 10; -} - inline bool is_digit(int c) { return (c >= '0' && c <= '9'); } @@ -66,7 +50,7 @@ public: uint pos; uint line; uint linestart; - + parsing_error( const std::string& message, std::string_view filename, @@ -75,47 +59,14 @@ public: uint line, uint linestart ); + + parsing_error( + const std::string& message, + std::string&& filename, + std::string&& source, + uint pos, + uint line, + uint linestart + ); std::string errorLog() const; }; - -class BasicParser { -protected: - std::string_view filename; - std::string_view source; - uint pos = 0; - uint line = 1; - uint linestart = 0; - - virtual void skipWhitespace(); - void skip(size_t n); - void skipLine(); - bool skipTo(const std::string& substring); - void expect(char expected); - void expect(const std::string& substring); - bool isNext(const std::string& substring); - void expectNewLine(); - void goBack(size_t count = 1); - void reset(); - - int64_t parseSimpleInt(int base); - dv::value parseNumber(int sign); - dv::value parseNumber(); - std::string parseString(char chr, bool closeRequired = true); - - parsing_error error(const std::string& message); -public: - std::string_view readUntil(char c); - std::string_view readUntil(std::string_view s, bool nothrow); - std::string_view readUntilWhitespace(); - std::string_view readUntilEOL(); - std::string parseName(); - std::string parseXmlName(); - bool hasNext(); - size_t remain() const; - char peek(); - char peekInLine(); - char peekNoJump(); - char nextChar(); - - BasicParser(std::string_view file, std::string_view source); -}; diff --git a/src/coders/json.cpp b/src/coders/json.cpp index 0990ff47..56356a40 100644 --- a/src/coders/json.cpp +++ b/src/coders/json.cpp @@ -7,12 +7,12 @@ #include #include "util/stringutil.hpp" -#include "commons.hpp" +#include "BasicParser.hpp" using namespace json; namespace { - class Parser : BasicParser { + class Parser : BasicParser { dv::value parseList(); dv::value parseObject(); dv::value parseValue(); diff --git a/src/coders/lua_parsing.cpp b/src/coders/lua_parsing.cpp index b3492bae..0840c369 100644 --- a/src/coders/lua_parsing.cpp +++ b/src/coders/lua_parsing.cpp @@ -2,18 +2,18 @@ #include -#include "commons.hpp" +#include "BasicParser.hpp" using namespace lua; using namespace devtools; -static std::set keywords { - "and", "break", "do", "else", "elseif", "end", "false", "for", "function", - "if", "in", "local", "nil", "not", "or", "repeat", "return", "then", "true", - "until", "while" +static std::set keywords { + L"and", L"break", L"do", L"else", L"elseif", L"end", L"false", L"for", L"function", + L"if", L"in", L"local", L"nil", L"not", L"or", L"repeat", L"return", L"then", L"true", + L"until", L"while" }; -bool lua::is_lua_keyword(std::string_view view) { +static bool is_lua_keyword(std::wstring_view view) { return keywords.find(view) != keywords.end(); } @@ -31,14 +31,14 @@ inline bool is_lua_operator_start(int c) { || c == '.'; } -class Tokenizer : BasicParser { +class Tokenizer : BasicParser { std::vector tokens; public: - Tokenizer(std::string_view file, std::string_view source) + Tokenizer(std::string_view file, std::wstring_view source) : BasicParser(file, source) { } - std::string parseLuaName() { + std::wstring parseLuaName() { char c = peek(); if (!is_identifier_start(c)) { throw error("identifier expected"); @@ -47,7 +47,7 @@ public: while (hasNext() && is_identifier_part(source[pos])) { pos++; } - return std::string(source.substr(start, pos - start)); + return std::wstring(source.substr(start, pos - start)); } inline Location currentLocation() const { @@ -58,7 +58,7 @@ public: } void emitToken( - TokenTag tag, std::string name, Location start, bool standalone=false + TokenTag tag, std::wstring name, Location start, bool standalone=false ) { tokens.emplace_back( tag, @@ -70,28 +70,28 @@ public: } /// @brief Get next operator token without checking operator for existing - std::string parseOperator() { + std::wstring parseOperator() { int start = pos; - char first = peek(); + wchar_t first = peek(); switch (first) { case '#': case '+': case '/': case '*': case '^': case '%': skip(1); - return std::string({first}); + return std::wstring({first}); case '-': skip(1); if (hasNext() && peekNoJump() == '-') { skip(1); - return "--"; + return L"--"; } - return std::string({first}); + return std::wstring({first}); } skip(1); char second = peekNoJump(); if ((first == '=' && second == '=') || (first == '~' && second == '=') || (first == '<' && second == '=') || (first == '>' && second == '=')) { skip(1); - return std::string(source.substr(start, pos - start)); + return std::wstring(source.substr(start, pos - start)); } if (first == '.' && second == '.') { skip(1); @@ -99,7 +99,7 @@ public: skip(1); } } - return std::string(source.substr(start, pos - start)); + return std::wstring(source.substr(start, pos - start)); } std::vector tokenize() { @@ -109,7 +109,7 @@ public: if (!hasNext()) { continue; } - char c = peek(); + wchar_t c = peek(); auto start = currentLocation(); if (is_lua_identifier_start(c)) { auto name = parseLuaName(); @@ -130,33 +130,37 @@ public: } catch (const parsing_error& err) {} auto literal = source.substr(start.pos, pos - start.pos); - emitToken(tag, std::string(literal), start); + emitToken(tag, std::wstring(literal), start); continue; } switch (c) { case '(': case '[': case '{': - if (isNext("[==[")) { - auto string = readUntil("]==]", true); + if (isNext(L"[==[")) { + auto string = readUntil(L"]==]", true); skip(4); - emitToken(TokenTag::COMMENT, std::string(string)+"]==]", start); + emitToken( + TokenTag::COMMENT, + std::wstring(string) + L"]==]", + start + ); continue; - } else if (isNext("[[")) { + } else if (isNext(L"[[")) { skip(2); - auto string = readUntil("]]", true); + auto string = readUntil(L"]]", true); skip(2); - emitToken(TokenTag::STRING, std::string(string), start); + emitToken(TokenTag::STRING, std::wstring(string), start); continue; } - emitToken(TokenTag::OPEN_BRACKET, std::string({c}), start, true); + emitToken(TokenTag::OPEN_BRACKET, std::wstring({c}), start, true); continue; case ')': case ']': case '}': - emitToken(TokenTag::CLOSE_BRACKET, std::string({c}), start, true); + emitToken(TokenTag::CLOSE_BRACKET, std::wstring({c}), start, true); continue; case ',': - emitToken(TokenTag::COMMA, std::string({c}), start, true); + emitToken(TokenTag::COMMA, std::wstring({c}), start, true); continue; case ';': - emitToken(TokenTag::SEMICOLON, std::string({c}), start, true); + emitToken(TokenTag::SEMICOLON, std::wstring({c}), start, true); continue; case '\'': case '"': { skip(1); @@ -168,9 +172,9 @@ public: } if (is_lua_operator_start(c)) { auto text = parseOperator(); - if (text == "--") { + if (text == L"--") { auto string = readUntilEOL(); - emitToken(TokenTag::COMMENT, std::string(string), start); + emitToken(TokenTag::COMMENT, std::wstring(string), start); skipLine(); continue; } @@ -178,12 +182,12 @@ public: continue; } auto text = readUntilWhitespace(); - emitToken(TokenTag::UNEXPECTED, std::string(text), start); + emitToken(TokenTag::UNEXPECTED, std::wstring(text), start); } return std::move(tokens); } }; -std::vector lua::tokenize(std::string_view file, std::string_view source) { +std::vector lua::tokenize(std::string_view file, std::wstring_view source) { return Tokenizer(file, source).tokenize(); } diff --git a/src/coders/lua_parsing.hpp b/src/coders/lua_parsing.hpp index 0054e4d0..2d5e6349 100644 --- a/src/coders/lua_parsing.hpp +++ b/src/coders/lua_parsing.hpp @@ -6,9 +6,7 @@ #include "devtools/syntax.hpp" namespace lua { - bool is_lua_keyword(std::string_view view); - std::vector tokenize( - std::string_view file, std::string_view source + std::string_view file, std::wstring_view source ); } diff --git a/src/coders/obj.cpp b/src/coders/obj.cpp index 182415df..0eed9bf6 100644 --- a/src/coders/obj.cpp +++ b/src/coders/obj.cpp @@ -1,11 +1,11 @@ #include "obj.hpp" #include "graphics/commons/Model.hpp" -#include "commons.hpp" +#include "BasicParser.hpp" using namespace model; -class ObjParser : BasicParser { +class ObjParser : BasicParser { std::vector coords {{0, 0, 0}}; std::vector uvs {{0, 0}}; std::vector normals {{0, 1, 0}}; diff --git a/src/coders/toml.cpp b/src/coders/toml.cpp index 3896bcec..8df231a2 100644 --- a/src/coders/toml.cpp +++ b/src/coders/toml.cpp @@ -9,11 +9,11 @@ #include "data/setting.hpp" #include "io/settings_io.hpp" #include "util/stringutil.hpp" -#include "commons.hpp" +#include "BasicParser.hpp" using namespace toml; -class TomlReader : BasicParser { +class TomlReader : BasicParser { dv::value root; void skipWhitespace() override { diff --git a/src/coders/xml.cpp b/src/coders/xml.cpp index 84cc9f37..9ed51e0c 100644 --- a/src/coders/xml.cpp +++ b/src/coders/xml.cpp @@ -6,6 +6,7 @@ #include #include "util/stringutil.hpp" +#include "coders/BasicParser.hpp" using namespace xml; @@ -191,7 +192,7 @@ inline bool is_xml_identifier_part(char c) { } namespace { -class Parser : BasicParser { +class Parser : BasicParser { std::unique_ptr document; std::unique_ptr parseOpenTag() { diff --git a/src/devtools/syntax.hpp b/src/devtools/syntax.hpp index 6c7d4b15..9a28ff05 100644 --- a/src/devtools/syntax.hpp +++ b/src/devtools/syntax.hpp @@ -16,11 +16,11 @@ namespace devtools { struct Token { TokenTag tag; - std::string text; + std::wstring text; Location start; Location end; - Token(TokenTag tag, std::string text, Location start, Location end) + Token(TokenTag tag, std::wstring text, Location start, Location end) : tag(tag), text(std::move(text)), start(std::move(start)), diff --git a/src/devtools/syntax_highlighting.cpp b/src/devtools/syntax_highlighting.cpp index 2963daf2..7aa95a23 100644 --- a/src/devtools/syntax_highlighting.cpp +++ b/src/devtools/syntax_highlighting.cpp @@ -56,7 +56,7 @@ static std::unique_ptr build_styles( } std::unique_ptr devtools::syntax_highlight( - const std::string& lang, std::string_view source + const std::string& lang, std::wstring_view source ) { try { if (lang == "lua") { diff --git a/src/devtools/syntax_highlighting.hpp b/src/devtools/syntax_highlighting.hpp index 18db6389..e30b0084 100644 --- a/src/devtools/syntax_highlighting.hpp +++ b/src/devtools/syntax_highlighting.hpp @@ -11,6 +11,6 @@ namespace devtools { }; std::unique_ptr syntax_highlight( - const std::string& lang, std::string_view source + const std::string& lang, std::wstring_view source ); } diff --git a/src/frontend/locale.cpp b/src/frontend/locale.cpp index d199cb49..669d0b4c 100644 --- a/src/frontend/locale.cpp +++ b/src/frontend/locale.cpp @@ -3,7 +3,7 @@ #include #include "coders/json.hpp" -#include "coders/commons.hpp" +#include "coders/BasicParser.hpp" #include "content/ContentPack.hpp" #include "io/io.hpp" #include "util/stringutil.hpp" @@ -39,7 +39,7 @@ const std::string& langs::Lang::getId() const { /// @brief Language key-value txt files parser namespace { - class Reader : BasicParser { + class Reader : BasicParser { void skipWhitespace() override { BasicParser::skipWhitespace(); if (hasNext() && source[pos] == '#') { diff --git a/src/graphics/ui/elements/TextBox.cpp b/src/graphics/ui/elements/TextBox.cpp index c7933c7e..791dcde8 100644 --- a/src/graphics/ui/elements/TextBox.cpp +++ b/src/graphics/ui/elements/TextBox.cpp @@ -588,9 +588,7 @@ void TextBox::stepDefaultUp(bool shiftPressed, bool breakSelection) { void TextBox::refreshSyntax() { if (!syntax.empty()) { - if (auto styles = devtools::syntax_highlight( - syntax, util::wstr2str_utf8(input) - )) { + if (auto styles = devtools::syntax_highlight(syntax, input)) { label->setStyles(std::move(styles)); } } diff --git a/src/logic/CommandsInterpreter.cpp b/src/logic/CommandsInterpreter.cpp index c13e4895..40851072 100644 --- a/src/logic/CommandsInterpreter.cpp +++ b/src/logic/CommandsInterpreter.cpp @@ -3,7 +3,7 @@ #include #include -#include "coders/commons.hpp" +#include "coders/BasicParser.hpp" #include "util/stringutil.hpp" using namespace cmd; @@ -17,7 +17,7 @@ inline bool is_cmd_identifier_start(char c) { return (is_identifier_start(c) || c == '.' || c == '$'); } -class CommandParser : BasicParser { +class CommandParser : BasicParser { std::string parseIdentifier(bool allowColon) { char c = peek(); if (!is_identifier_start(c) && c != '$') { diff --git a/src/util/stringutil.cpp b/src/util/stringutil.cpp index 5242e01a..53f77239 100644 --- a/src/util/stringutil.cpp +++ b/src/util/stringutil.cpp @@ -88,6 +88,18 @@ std::wstring util::rfill(std::wstring s, uint length, wchar_t c) { return ss.str(); } +static size_t length_utf8_codepoint(uint32_t c) { + if (c < 0x80) { + return 1; + } else if (c < 0x0800) { + return 2; + } else if (c < 0x010000) { + return 3; + } else { + return 4; + } +} + uint util::encode_utf8(uint32_t c, ubyte* bytes) { if (c < 0x80) { bytes[0] = ((c >> 0) & 0x7F) | 0x00; @@ -179,8 +191,16 @@ size_t util::length_utf8(std::string_view s) { return length; } +size_t util::length_utf8(std::wstring_view s) { + size_t length = 0; + for (size_t i = 0; i < s.length(); i++) { + length += length_utf8_codepoint(s[i]); + } + return length; +} + template -std::string xstr2str_utf8(const std::basic_string& xs) { +std::string xstr2str_utf8(std::basic_string_view xs) { std::vector chars; ubyte buffer[4]; for (C xc : xs) { @@ -193,16 +213,16 @@ std::string xstr2str_utf8(const std::basic_string& xs) { return std::string(chars.data(), chars.size()); } -std::string util::wstr2str_utf8(const std::wstring& ws) { +std::string util::wstr2str_utf8(std::wstring_view ws) { return xstr2str_utf8(ws); } -std::string util::u32str2str_utf8(const std::u32string& ws) { +std::string util::u32str2str_utf8(std::u32string_view ws) { return xstr2str_utf8(ws); } template -std::basic_string str2xstr_utf8(const std::string& s) { +std::basic_string str2xstr_utf8(std::string_view s) { std::vector chars; size_t pos = 0; uint size = 0; @@ -213,7 +233,7 @@ std::basic_string str2xstr_utf8(const std::string& s) { return std::basic_string(chars.data(), chars.size()); } -std::wstring util::str2wstr_utf8(const std::string& s) { +std::wstring util::str2wstr_utf8(std::string_view s) { return str2xstr_utf8(s); } diff --git a/src/util/stringutil.hpp b/src/util/stringutil.hpp index ed3061f5..522ad2ca 100644 --- a/src/util/stringutil.hpp +++ b/src/util/stringutil.hpp @@ -22,23 +22,33 @@ namespace util { /// @brief Encode raw wstring to UTF-8 /// @param ws source raw wstring /// @return new UTF-8 encoded string - std::string wstr2str_utf8(const std::wstring& ws); + std::string wstr2str_utf8(std::wstring_view ws); /// @brief Decode UTF-8 string /// @param s source encoded string /// @return new raw decoded wstring - std::wstring str2wstr_utf8(const std::string& s); + std::wstring str2wstr_utf8(std::string_view s); /// @brief Encode raw u32string to UTF-8 /// @param ws source raw wstring /// @return new UTF-8 encoded string - std::string u32str2str_utf8(const std::u32string& ws); + std::string u32str2str_utf8(std::u32string_view ws); /// @brief Decode UTF-8 string /// @param s source encoded string /// @return new raw decoded u32string std::u32string str2u32str_utf8(const std::string& s); + inline std::string str2str_utf8(std::string_view s) { + return std::string(s); + } + inline std::string str2str_utf8(std::wstring_view s) { + return wstr2str_utf8(s); + } + inline std::string str2str_utf8(std::u32string_view s) { + return u32str2str_utf8(s); + } + /// @brief Calculated length of UTF-8 encoded string that fits into maxSize /// @param s source UTF-8 encoded string view /// @param maxSize max encoded string length after crop @@ -49,6 +59,8 @@ namespace util { /// @param s source encoded string /// @return unicode string length (number of codepoints) size_t length_utf8(std::string_view s); + + size_t length_utf8(std::wstring_view s); bool is_integer(const std::string& text); bool is_integer(const std::wstring& text); diff --git a/test/coders/lua_parsing.cpp b/test/coders/lua_parsing.cpp index 17492d7a..86f3f415 100644 --- a/test/coders/lua_parsing.cpp +++ b/test/coders/lua_parsing.cpp @@ -13,9 +13,11 @@ TEST(lua_parsing, Tokenizer) { auto filename = "res:scripts/stdlib.lua"; auto source = io::read_string(filename); try { - auto tokens = lua::tokenize(filename, source); + auto tokens = lua::tokenize(filename, util::str2wstr_utf8(source)); for (const auto& token : tokens) { - std::cout << (int)token.tag << " " << util::quote(token.text) << std::endl; + std::cout << (int)token.tag << " " + << util::quote(util::wstr2str_utf8(token.text)) + << std::endl; } } catch (const parsing_error& err) { std::cerr << err.errorLog() << std::endl;