diff --git a/doc/en/scripting.md b/doc/en/scripting.md index 2ed46bdb..d1fdc645 100644 --- a/doc/en/scripting.md +++ b/doc/en/scripting.md @@ -21,6 +21,7 @@ Subsections: - [player](scripting/builtins/libplayer.md) - [quat](scripting/builtins/libquat.md) - [time](scripting/builtins/libtime.md) + - [utf8](scripting/builtins/libutf8.md) - [vec2, vec3, vec4](scripting/builtins/libvecn.md) - [world](scripting/builtins/libworld.md) - [Module core:bit_converter](scripting/modules/core_bit_converter.md) diff --git a/doc/en/scripting/builtins/libutf8.md b/doc/en/scripting/builtins/libutf8.md new file mode 100644 index 00000000..e7801f97 --- /dev/null +++ b/doc/en/scripting/builtins/libutf8.md @@ -0,0 +1,21 @@ +# *utf8* library + +The library provides functions for working with UTF-8. + +```lua +-- Converts a UTF-8 string to a Bytearray or an array of numbers if +-- the second argument is true +utf8.tobytes(text: str, [optional] usetable=false) -> Bytearray|table + +-- Converts a Bytearray or an array of numbers to a UTF-8 string +utf8.tostring(bytes: Bytearray|table) -> str + +-- Returns the length of a Unicode string +utf8.length(text: str) -> int + +-- Returns the code of the first character of the string +utf8.codepoint(chars: str) -> int + +-- Returns a substring from position startchar to endchar inclusive +utf8.sub(text: str, startchar: int, [optional] endchar: int) -> str +``` diff --git a/doc/ru/scripting.md b/doc/ru/scripting.md index 1f582d46..0900788e 100644 --- a/doc/ru/scripting.md +++ b/doc/ru/scripting.md @@ -21,6 +21,7 @@ - [player](scripting/builtins/libplayer.md) - [quat](scripting/builtins/libquat.md) - [time](scripting/builtins/libtime.md) + - [utf8](scripting/builtins/libutf8.md) - [vec2, vec3, vec4](scripting/builtins/libvecn.md) - [world](scripting/builtins/libworld.md) - [Модуль core:bit_converter](scripting/modules/core_bit_converter.md) diff --git a/doc/ru/scripting/builtins/libutf8.md b/doc/ru/scripting/builtins/libutf8.md new file mode 100644 
index 00000000..b29bb403
--- /dev/null
+++ b/doc/ru/scripting/builtins/libutf8.md
@@ -0,0 +1,21 @@
+# Библиотека *utf8*
+
+Библиотека предоставляет функции для работы с UTF-8.
+
+```lua
+-- Конвертирует UTF-8 строку в Bytearray или массив чисел если
+-- второй аргумент - true
+utf8.tobytes(text: str, [опционально] usetable=false) -> Bytearray|table
+
+-- Конвертирует Bytearray или массив чисел в UTF-8 строку
+utf8.tostring(bytes: Bytearray|table) -> str
+
+-- Возвращает длину юникод-строки
+utf8.length(text: str) -> int
+
+-- Возвращает код первого символа строки
+utf8.codepoint(chars: str) -> int
+
+-- Возвращает подстроку от позиции startchar до endchar включительно
+utf8.sub(text: str, startchar: int, [опционально] endchar: int) -> str
+```
diff --git a/src/logic/scripting/lua/libs/api_lua.hpp b/src/logic/scripting/lua/libs/api_lua.hpp
index 2ac9c1e8..f9c8b3f8 100644
--- a/src/logic/scripting/lua/libs/api_lua.hpp
+++ b/src/logic/scripting/lua/libs/api_lua.hpp
@@ -35,6 +35,7 @@ extern const luaL_Reg playerlib[];
 extern const luaL_Reg quatlib[]; // quat.cpp
 extern const luaL_Reg timelib[];
 extern const luaL_Reg tomllib[];
+extern const luaL_Reg utf8lib[];
 extern const luaL_Reg vec2lib[]; // vecn.cpp
 extern const luaL_Reg vec3lib[]; // vecn.cpp
 extern const luaL_Reg vec4lib[]; // vecn.cpp
diff --git a/src/logic/scripting/lua/libs/libutf8.cpp b/src/logic/scripting/lua/libs/libutf8.cpp
new file mode 100644
index 00000000..182f6295
--- /dev/null
+++ b/src/logic/scripting/lua/libs/libutf8.cpp
@@ -0,0 +1,108 @@
+#include "api_lua.hpp"
+
+#include <algorithm>
+#include <cstring>
+#include <stdexcept>
+#include <string>
+
+#include "../lua_custom_types.hpp"
+#include "util/stringutil.hpp"
+
+// NOTE(review): template arguments were stripped from the patch this file
+// was recovered from; lua::LuaBytearray is assumed below - confirm.
+
+/// @brief utf8.tobytes(text, [usetable]): copies the raw bytes of a string
+/// into a table of integers (usetable is truthy) or into a Bytearray.
+static int l_encode(lua::State* L) {
+    std::string_view string = lua::require_string(L, 1);
+    if (lua::toboolean(L, 2)) {
+        lua::createtable(L, string.length(), 0);
+        for (size_t i = 0; i < string.length(); i++) {
+            // mask so that bytes >= 0x80 do not become negative numbers
+            lua::pushinteger(L, string[i] & 0xFF);
+            lua::rawseti(L, i + 1);
+        }
+    } else {
+        lua::newuserdata<lua::LuaBytearray>(L, string.length());
+        auto bytearray = lua::touserdata<lua::LuaBytearray>(L, -1);
+        // resize, not reserve: memcpy needs size() >= length, while
+        // reserve only grows the capacity
+        bytearray->data().resize(string.length());
+        if (!string.empty()) {
+            std::memcpy(
+                bytearray->data().data(), string.data(), string.length()
+            );
+        }
+    }
+    return 1;
+}
+
+/// @brief utf8.tostring(bytes): builds a string from a Bytearray or from
+/// an array of byte values.
+/// @throws std::runtime_error if the argument is neither of them
+static int l_decode(lua::State* L) {
+    if (lua::istable(L, 1)) {
+        size_t size = lua::objlen(L, 1);
+        std::string text(size, '\0');
+        for (size_t i = 0; i < size; i++) {
+            // read element i+1 of the table at stack index 1
+            lua_rawgeti(L, 1, static_cast<int>(i + 1));
+            text[i] = static_cast<char>(lua_tointeger(L, -1) & 0xFF);
+            lua_pop(L, 1);
+        }
+        return lua::pushstring(L, text);
+    } else if (auto bytes = lua::touserdata<lua::LuaBytearray>(L, 1)) {
+        return lua::pushstring(
+            L,
+            std::string(
+                reinterpret_cast<const char*>(bytes->data().data()),
+                bytes->data().size()
+            )
+        );
+    }
+    // nothing was pushed, so returning 1 would hand back a stale stack slot
+    throw std::runtime_error("bytearray or table expected");
+}
+
+/// @brief utf8.length(text): number of code points in a UTF-8 string.
+static int l_length(lua::State* L) {
+    auto string = lua::require_string(L, 1);
+    return lua::pushinteger(L, util::length_utf8(string));
+}
+
+/// @brief utf8.codepoint(chars): code of the first character, 0 if empty.
+static int l_codepoint(lua::State* L) {
+    std::string_view string = lua::require_string(L, 1);
+    if (string.empty()) {
+        return lua::pushinteger(L, 0);
+    }
+    uint size = 0;
+    return lua::pushinteger(L, util::decode_utf8(size, string.data()));
+}
+
+/// @brief utf8.sub(text, startchar, [endchar]): substring by character
+/// positions; 1-based, endchar inclusive, defaults to the end of the text.
+static int l_sub(lua::State* L) {
+    auto string = util::str2u32str_utf8(lua::require_string(L, 1));
+    const auto length = static_cast<long long>(string.length());
+    // 1-based inclusive [startchar, endchar] -> 0-based half-open [start, end)
+    const auto start =
+        std::clamp<long long>(lua::tointeger(L, 2) - 1, 0, length);
+    auto end = length;
+    if (lua::gettop(L) >= 3) {
+        end = std::clamp<long long>(lua::tointeger(L, 3), 0, length);
+    }
+    // substr takes (position, count), not (position, end)
+    const size_t count = end > start ? static_cast<size_t>(end - start) : 0;
+    const auto piece = string.substr(static_cast<size_t>(start), count);
+    return lua::pushstring(L, util::u32str2str_utf8(piece));
+}
+
+const luaL_Reg utf8lib[] = {
+    {"tobytes", lua::wrap<l_encode>},
+    {"tostring", lua::wrap<l_decode>},
+    {"length", lua::wrap<l_length>},
+    {"codepoint", lua::wrap<l_codepoint>},
+    {"sub", lua::wrap<l_sub>},
+    {NULL, NULL}
+};
diff --git a/src/logic/scripting/lua/lua_engine.cpp b/src/logic/scripting/lua/lua_engine.cpp
index fd40e612..fe3e75b5 100644
--- a/src/logic/scripting/lua/lua_engine.cpp
+++ b/src/logic/scripting/lua/lua_engine.cpp
@@ -51,6 +51,7 @@ static void create_libs(State* L, StateType stateType) {
     openlib(L, "quat", quatlib);
     openlib(L, "time", timelib);
     openlib(L, "toml", tomllib);
+    openlib(L, "utf8", utf8lib);
     openlib(L, "vec2", vec2lib);
     openlib(L, "vec3", vec3lib);
     openlib(L, "vec4", vec4lib);
diff --git 
a/src/util/stringutil.cpp b/src/util/stringutil.cpp
index c104bd5e..12936343 100644
--- a/src/util/stringutil.cpp
+++ b/src/util/stringutil.cpp
@@ -128,7 +128,7 @@ inline uint utf8_len(ubyte cp) {
     if ((cp & 0xF8) == 0xF0) {
         return 4;
     }
-    return 0;
+    throw std::runtime_error("utf8 decode error");
 }
 
 uint32_t util::decode_utf8(uint& size, const char* chr) {
@@ -156,6 +156,16 @@ size_t util::crop_utf8(std::string_view s, size_t maxSize) {
     return pos;
 }
 
+size_t util::length_utf8(std::string_view s) {
+    // utf8_len gives the byte width of the sequence starting at a lead
+    // byte (and throws on an invalid one), so each hop is one code point
+    size_t count = 0;
+    for (size_t offset = 0; offset < s.length(); count++) {
+        offset += utf8_len(s[offset]);
+    }
+    return count;
+}
+
 template
 std::string xstr2str_utf8(const std::basic_string& xs) {
     std::vector chars;
diff --git a/src/util/stringutil.hpp b/src/util/stringutil.hpp
index c7bc2fb5..f584a53c 100644
--- a/src/util/stringutil.hpp
+++ b/src/util/stringutil.hpp
@@ -44,6 +44,11 @@ namespace util {
     /// @param maxSize max encoded string length after crop
     /// @return cropped string size (less or equal to maxSize)
     size_t crop_utf8(std::string_view s, size_t maxSize);
+
+    /// @brief Measure utf8-encoded string length
+    /// @param s source encoded string
+    /// @return unicode string length (number of codepoints)
+    size_t length_utf8(std::string_view s);
 
     bool is_integer(const std::string& text);
     bool is_integer(const std::wstring& text);