From 52e62bbf9573d23729e61c307a0fabc2a549d828 Mon Sep 17 00:00:00 2001 From: MihailRis Date: Fri, 25 Oct 2024 17:22:54 +0300 Subject: [PATCH 1/7] add utf8.encode --- src/logic/scripting/lua/libs/api_lua.hpp | 1 + src/logic/scripting/lua/libs/libutf8.cpp | 28 ++++++++++++++++++++++++ src/logic/scripting/lua/lua_engine.cpp | 1 + 3 files changed, 30 insertions(+) create mode 100644 src/logic/scripting/lua/libs/libutf8.cpp diff --git a/src/logic/scripting/lua/libs/api_lua.hpp b/src/logic/scripting/lua/libs/api_lua.hpp index 2ac9c1e8..f9c8b3f8 100644 --- a/src/logic/scripting/lua/libs/api_lua.hpp +++ b/src/logic/scripting/lua/libs/api_lua.hpp @@ -35,6 +35,7 @@ extern const luaL_Reg playerlib[]; extern const luaL_Reg quatlib[]; // quat.cpp extern const luaL_Reg timelib[]; extern const luaL_Reg tomllib[]; +extern const luaL_Reg utf8lib[]; extern const luaL_Reg vec2lib[]; // vecn.cpp extern const luaL_Reg vec3lib[]; // vecn.cpp extern const luaL_Reg vec4lib[]; // vecn.cpp diff --git a/src/logic/scripting/lua/libs/libutf8.cpp b/src/logic/scripting/lua/libs/libutf8.cpp new file mode 100644 index 00000000..792f2513 --- /dev/null +++ b/src/logic/scripting/lua/libs/libutf8.cpp @@ -0,0 +1,28 @@ +#include "api_lua.hpp" + +#include + +#include "../lua_custom_types.hpp" +#include "util/stringutil.hpp" + +int l_encode(lua::State* L) { + std::string string = lua::require_string(L, 1); + if (lua::toboolean(L, 2)) { + lua::createtable(L, string.length(), 0); + for (size_t i = 0; i < string.length(); i++) { + lua::pushinteger(L, string[i]); + lua::rawseti(L, i+1); + } + } else { + lua::newuserdata(L, string.length()); + auto bytearray = lua::touserdata(L, -1); + bytearray->data().reserve(string.length()); + std::memcpy(bytearray->data().data(), string.data(), string.length()); + } + return 1; +} + +const luaL_Reg utf8lib[] = { + {"encode", lua::wrap}, + {NULL, NULL} +}; diff --git a/src/logic/scripting/lua/lua_engine.cpp b/src/logic/scripting/lua/lua_engine.cpp index fd40e612..fe3e75b5 100644 --- a/src/logic/scripting/lua/lua_engine.cpp +++ b/src/logic/scripting/lua/lua_engine.cpp @@ -51,6 +51,7 @@ static void create_libs(State* L, StateType stateType) { openlib(L, "quat", quatlib); openlib(L, "time", timelib); openlib(L, "toml", tomllib); + openlib(L, "utf8", utf8lib); openlib(L, "vec2", vec2lib); openlib(L, "vec3", vec3lib); openlib(L, "vec4", vec4lib); From 52ed1abf1c9d1d56fc9610d5d67d476ee2efbc2d Mon Sep 17 00:00:00 2001 From: MihailRis Date: Fri, 25 Oct 2024 22:56:11 +0300 Subject: [PATCH 2/7] add utf8.decode --- src/logic/scripting/lua/libs/libutf8.cpp | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/src/logic/scripting/lua/libs/libutf8.cpp b/src/logic/scripting/lua/libs/libutf8.cpp index 792f2513..4cfedd03 100644 --- a/src/logic/scripting/lua/libs/libutf8.cpp +++ b/src/logic/scripting/lua/libs/libutf8.cpp @@ -5,7 +5,7 @@ #include "../lua_custom_types.hpp" #include "util/stringutil.hpp" -int l_encode(lua::State* L) { +static int l_encode(lua::State* L) { std::string string = lua::require_string(L, 1); if (lua::toboolean(L, 2)) { lua::createtable(L, string.length(), 0); @@ -22,7 +22,25 @@ int l_encode(lua::State* L) { return 1; } +static int l_decode(lua::State* L) { + if (lua::istable(L, 1)) { + size_t size = lua::objlen(L, 1); + util::Buffer buffer(size); + return lua::pushstring(L, std::string(buffer.data(), size)); + } else if (auto bytes = lua::touserdata(L, 1)) { + return lua::pushstring( + L, + std::string( + reinterpret_cast(bytes->data().data()), + bytes->data().size() + ) + ); + } + return 1; +} + const luaL_Reg utf8lib[] = { {"encode", lua::wrap}, + {"decode", lua::wrap}, {NULL, NULL} }; From f3181dee88500a130fe5b6b5f66a8c55f92798d9 Mon Sep 17 00:00:00 2001 From: MihailRis Date: Sat, 26 Oct 2024 10:31:45 +0300 Subject: [PATCH 3/7] add util::length_utf8 --- src/util/stringutil.cpp | 12 +++++++++++- src/util/stringutil.hpp | 5 +++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/src/util/stringutil.cpp b/src/util/stringutil.cpp index c104bd5e..12936343 100644 --- a/src/util/stringutil.cpp +++ b/src/util/stringutil.cpp @@ -128,7 +128,7 @@ inline uint utf8_len(ubyte cp) { if ((cp & 0xF8) == 0xF0) { return 4; } - return 0; + throw std::runtime_error("utf8 decode error"); } uint32_t util::decode_utf8(uint& size, const char* chr) { @@ -156,6 +156,16 @@ size_t util::crop_utf8(std::string_view s, size_t maxSize) { return pos; } +size_t util::length_utf8(std::string_view s) { + size_t length = 0; + size_t pos = 0; + while (pos < s.length()) { + pos += utf8_len(s[pos]); + length++; + } + return length; +} + template std::string xstr2str_utf8(const std::basic_string& xs) { std::vector chars; diff --git a/src/util/stringutil.hpp b/src/util/stringutil.hpp index c7bc2fb5..f584a53c 100644 --- a/src/util/stringutil.hpp +++ b/src/util/stringutil.hpp @@ -44,6 +44,11 @@ namespace util { /// @param maxSize max encoded string length after crop /// @return cropped string size (less or equal to maxSize) size_t crop_utf8(std::string_view s, size_t maxSize); + + /// @brief Measure utf8-encoded string length + /// @param s source encoded string + /// @return unicode string length (number of codepoints) + size_t length_utf8(std::string_view s); bool is_integer(const std::string& text); bool is_integer(const std::wstring& text); From 3dc334e778262a6c7f7614a62c78e27e2d4cc5d3 Mon Sep 17 00:00:00 2001 From: MihailRis Date: Sat, 26 Oct 2024 10:32:34 +0300 Subject: [PATCH 4/7] add utf8.length, utf8.codepoint --- src/logic/scripting/lua/libs/libutf8.cpp | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/src/logic/scripting/lua/libs/libutf8.cpp b/src/logic/scripting/lua/libs/libutf8.cpp index 4cfedd03..80af5d5d 100644 --- a/src/logic/scripting/lua/libs/libutf8.cpp +++ b/src/logic/scripting/lua/libs/libutf8.cpp @@ -6,11 +6,11 @@ #include "util/stringutil.hpp" static int l_encode(lua::State* L) { - std::string string = lua::require_string(L, 1); + std::string_view string = lua::require_string(L, 1); if (lua::toboolean(L, 2)) { lua::createtable(L, string.length(), 0); for (size_t i = 0; i < string.length(); i++) { - lua::pushinteger(L, string[i]); + lua::pushinteger(L, string[i] & 0xFF); lua::rawseti(L, i+1); } } else { @@ -39,8 +39,24 @@ static int l_decode(lua::State* L) { return 1; } +static int l_length(lua::State* L) { + auto string = lua::require_string(L, 1); + return lua::pushinteger(L, util::length_utf8(string)); +} + +static int l_codepoint(lua::State* L) { + std::string_view string = lua::require_string(L, 1); + if (string.empty()) { + return lua::pushinteger(L, 0); + } + uint size; + return lua::pushinteger(L, util::decode_utf8(size, string.data())); +} + const luaL_Reg utf8lib[] = { {"encode", lua::wrap}, {"decode", lua::wrap}, + {"length", lua::wrap}, + {"codepoint", lua::wrap}, {NULL, NULL} }; From ce7698e9ee22b4c149699766ad7ff1411d568f88 Mon Sep 17 00:00:00 2001 From: MihailRis Date: Sat, 26 Oct 2024 10:54:33 +0300 Subject: [PATCH 5/7] rename utf8.encode to utf8.tobytes, utf8.decode to utf8.tostring --- src/logic/scripting/lua/libs/libutf8.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/logic/scripting/lua/libs/libutf8.cpp b/src/logic/scripting/lua/libs/libutf8.cpp index 80af5d5d..f273284c 100644 --- a/src/logic/scripting/lua/libs/libutf8.cpp +++ b/src/logic/scripting/lua/libs/libutf8.cpp @@ -54,8 +54,8 @@ static int l_codepoint(lua::State* L) { } const luaL_Reg utf8lib[] = { - {"encode", lua::wrap}, - {"decode", lua::wrap}, + {"tobytes", lua::wrap}, + {"tostring", lua::wrap}, {"length", lua::wrap}, {"codepoint", lua::wrap}, {NULL, NULL} From bd60d05c3ae631df8fa56129b31aae2242b167ab Mon Sep 17 00:00:00 2001 From: MihailRis Date: Sat, 26 Oct 2024 11:07:48 +0300 Subject: [PATCH 6/7] add utf8 library docs --- doc/en/scripting.md | 1 + doc/en/scripting/builtins/libutf8.md | 18 ++++++++++++++++++ doc/ru/scripting.md | 1 + doc/ru/scripting/builtins/libutf8.md | 18 ++++++++++++++++++ 4 files changed, 38 insertions(+) create mode 100644 doc/en/scripting/builtins/libutf8.md create mode 100644 doc/ru/scripting/builtins/libutf8.md diff --git a/doc/en/scripting.md b/doc/en/scripting.md index 2ed46bdb..d1fdc645 100644 --- a/doc/en/scripting.md +++ b/doc/en/scripting.md @@ -21,6 +21,7 @@ Subsections: - [player](scripting/builtins/libplayer.md) - [quat](scripting/builtins/libquat.md) - [time](scripting/builtins/libtime.md) + - [utf8](scripting/builtins/libutf8.md) - [vec2, vec3, vec4](scripting/builtins/libvecn.md) - [world](scripting/builtins/libworld.md) - [Module core:bit_converter](scripting/modules/core_bit_converter.md) diff --git a/doc/en/scripting/builtins/libutf8.md b/doc/en/scripting/builtins/libutf8.md new file mode 100644 index 00000000..55851f88 --- /dev/null +++ b/doc/en/scripting/builtins/libutf8.md @@ -0,0 +1,18 @@ +# *utf8* library + +The library provides functions for working with UTF-8. + +```lua +-- Converts a UTF-8 string to a Bytearray or an array of numbers if +-- the second argument is true +utf8.tobytes(text: str, [optional] usetable=false) -> Bytearray|table + +-- Converts a Bytearray or an array of numbers to a UTF-8 string +utf8.tostring(bytes: Bytearray|table) -> str + +-- Returns the length of a Unicode string +utf8.length(text: str) -> int + +-- Returns the code of the first character of the string +utf8.codepoint(chars: str) -> int +``` diff --git a/doc/ru/scripting.md b/doc/ru/scripting.md index 1f582d46..0900788e 100644 --- a/doc/ru/scripting.md +++ b/doc/ru/scripting.md @@ -21,6 +21,7 @@ - [player](scripting/builtins/libplayer.md) - [quat](scripting/builtins/libquat.md) - [time](scripting/builtins/libtime.md) + - [utf8](scripting/builtins/libutf8.md) - [vec2, vec3, vec4](scripting/builtins/libvecn.md) - [world](scripting/builtins/libworld.md) - [Модуль core:bit_converter](scripting/modules/core_bit_converter.md) diff --git a/doc/ru/scripting/builtins/libutf8.md b/doc/ru/scripting/builtins/libutf8.md new file mode 100644 index 00000000..41805b38 --- /dev/null +++ b/doc/ru/scripting/builtins/libutf8.md @@ -0,0 +1,18 @@ +# Библиотека *utf8* + +Библиотека предоставляет функции для работы с UTF-8. + +```lua +-- Конвертирует UTF-8 строку в Bytearray или массив чисел если +-- второй аргумент - true +utf8.tobytes(text: str, [опционально] usetable=false) -> Bytearray|table + +-- Конвертирует Bytearray или массив чисел в UTF-8 строку +utf8.tostring(bytes: Bytearray|table) -> str + +-- Возвращает длину юникод-строки +utf8.length(text: str) -> int + +-- Возвращает код первого символа строки +utf8.codepoint(chars: str) -> int +``` From 137e6fc7678d67ae7e64c0d4b00140063191bf3f Mon Sep 17 00:00:00 2001 From: MihailRis Date: Sat, 26 Oct 2024 11:24:58 +0300 Subject: [PATCH 7/7] add utf8.sub --- doc/en/scripting/builtins/libutf8.md | 3 +++ doc/ru/scripting/builtins/libutf8.md | 3 +++ src/logic/scripting/lua/libs/libutf8.cpp | 11 +++++++++++ 3 files changed, 17 insertions(+) diff --git a/doc/en/scripting/builtins/libutf8.md b/doc/en/scripting/builtins/libutf8.md index 55851f88..e7801f97 100644 --- a/doc/en/scripting/builtins/libutf8.md +++ b/doc/en/scripting/builtins/libutf8.md @@ -15,4 +15,7 @@ utf8.length(text: str) -> int -- Returns the code of the first character of the string utf8.codepoint(chars: str) -> int + +-- Returns a substring from position startchar to endchar inclusive +utf8.sub(text: str, startchar: int, [optional] endchar: int) -> str ``` diff --git a/doc/ru/scripting/builtins/libutf8.md b/doc/ru/scripting/builtins/libutf8.md index 41805b38..b29bb403 100644 --- a/doc/ru/scripting/builtins/libutf8.md +++ b/doc/ru/scripting/builtins/libutf8.md @@ -15,4 +15,7 @@ utf8.length(text: str) -> int -- Возвращает код первого символа строки utf8.codepoint(chars: str) -> int + +-- Возвращает подстроку от позиции startchar до endchar включительно +utf8.sub(text: str, startchar: int, [опционально] endchar: int) -> str ``` diff --git a/src/logic/scripting/lua/libs/libutf8.cpp b/src/logic/scripting/lua/libs/libutf8.cpp index f273284c..182f6295 100644 --- a/src/logic/scripting/lua/libs/libutf8.cpp +++ b/src/logic/scripting/lua/libs/libutf8.cpp @@ -53,10 +53,21 @@ static int l_codepoint(lua::State* L) { return lua::pushinteger(L, util::decode_utf8(size, string.data())); } +static int l_sub(lua::State* L) { + auto string = util::str2u32str_utf8(lua::require_string(L, 1)); + int start = std::max(0, static_cast(lua::tointeger(L, 2) - 1)); + int end = string.length(); + if (lua::gettop(L) >= 3) { + end = std::max(0, static_cast(lua::tointeger(L, 3) - 1)); + } + return lua::pushstring(L, util::u32str2str_utf8(string.substr(start, end))); +} + const luaL_Reg utf8lib[] = { {"tobytes", lua::wrap}, {"tostring", lua::wrap}, {"length", lua::wrap}, {"codepoint", lua::wrap}, + {"sub", lua::wrap}, {NULL, NULL} };