Merge pull request #329 from MihailRis/add-utf8-lib

Add Lua utf8 library
This commit is contained in:
MihailRis 2024-10-26 12:11:31 +03:00 committed by GitHub
commit 8a594c41b4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 135 additions and 1 deletions

View File

@ -21,6 +21,7 @@ Subsections:
- [player](scripting/builtins/libplayer.md)
- [quat](scripting/builtins/libquat.md)
- [time](scripting/builtins/libtime.md)
- [utf8](scripting/builtins/libutf8.md)
- [vec2, vec3, vec4](scripting/builtins/libvecn.md)
- [world](scripting/builtins/libworld.md)
- [Module core:bit_converter](scripting/modules/core_bit_converter.md)

View File

@ -0,0 +1,21 @@
# *utf8* library
The library provides functions for working with UTF-8.
```lua
-- Converts a UTF-8 string to a Bytearray or an array of numbers if
-- the second argument is true
utf8.tobytes(text: str, [optional] usetable=false) -> Bytearray|table
-- Converts a Bytearray or an array of numbers to a UTF-8 string
utf8.tostring(bytes: Bytearray|table) -> str
-- Returns the length of a UTF-8 string in Unicode code points (not bytes)
utf8.length(text: str) -> int
-- Returns the Unicode code point of the first character of the string (0 if the string is empty)
utf8.codepoint(chars: str) -> int
-- Returns a substring from position startchar to endchar inclusive
utf8.sub(text: str, startchar: int, [optional] endchar: int) -> str
```

View File

@ -21,6 +21,7 @@
- [player](scripting/builtins/libplayer.md)
- [quat](scripting/builtins/libquat.md)
- [time](scripting/builtins/libtime.md)
- [utf8](scripting/builtins/libutf8.md)
- [vec2, vec3, vec4](scripting/builtins/libvecn.md)
- [world](scripting/builtins/libworld.md)
- [Модуль core:bit_converter](scripting/modules/core_bit_converter.md)

View File

@ -0,0 +1,21 @@
# Библиотека *utf8*
Библиотека предоставляет функции для работы с UTF-8.
```lua
-- Конвертирует UTF-8 строку в Bytearray или массив чисел если
-- второй аргумент - true
utf8.tobytes(text: str, [опционально] usetable=false) -> Bytearray|table
-- Конвертирует Bytearray или массив чисел в UTF-8 строку
utf8.tostring(bytes: Bytearray|table) -> str
-- Возвращает длину юникод-строки
utf8.length(text: str) -> int
-- Возвращает код первого символа строки
utf8.codepoint(chars: str) -> int
-- Возвращает подстроку от позиции startchar до endchar включительно
utf8.sub(text: str, startchar: int, [опционально] endchar: int) -> str
```

View File

@ -35,6 +35,7 @@ extern const luaL_Reg playerlib[];
extern const luaL_Reg quatlib[]; // quat.cpp
extern const luaL_Reg timelib[];
extern const luaL_Reg tomllib[];
extern const luaL_Reg utf8lib[];
extern const luaL_Reg vec2lib[]; // vecn.cpp
extern const luaL_Reg vec3lib[]; // vecn.cpp
extern const luaL_Reg vec4lib[]; // vecn.cpp

View File

@ -0,0 +1,73 @@
#include "api_lua.hpp"
#include <algorithm>
#include <cstring>
#include <string>
#include <string_view>
#include <vector>
#include "../lua_custom_types.hpp"
#include "util/stringutil.hpp"
/// @brief Lua binding for `utf8.tobytes(text, [usetable])`.
/// Pushes the raw bytes of the string passed as argument #1 either as a Lua
/// table of integers (when argument #2 is truthy) or as a Bytearray userdata.
/// @param L Lua state; argument #1 must be a string
/// @return number of Lua results (always 1)
static int l_encode(lua::State* L) {
    std::string_view string = lua::require_string(L, 1);
    if (lua::toboolean(L, 2)) {
        lua::createtable(L, string.length(), 0);
        for (size_t i = 0; i < string.length(); i++) {
            // Mask to 0..255 so bytes >= 0x80 are not pushed as negative
            // numbers where `char` is signed.
            lua::pushinteger(L, string[i] & 0xFF);
            lua::rawseti(L, i+1);
        }
    } else {
        lua::newuserdata<lua::LuaBytearray>(L, string.length());
        auto bytearray = lua::touserdata<lua::LuaBytearray>(L, -1);
        // assign() sets both size and contents, so the result is correct
        // whether or not the constructor already sized the storage, and no
        // raw copy through a possibly-null data() pointer is made for an
        // empty string (reserve() alone never changes the size).
        bytearray->data().assign(string.begin(), string.end());
    }
    return 1;
}
/// @brief Lua binding for `utf8.tostring(bytes)`.
/// Builds a string from the bytes given as argument #1, which may be a Lua
/// table of integers or a Bytearray userdata.
/// @param L Lua state
/// @return number of Lua results (always 1)
static int l_decode(lua::State* L) {
    if (lua::istable(L, 1)) {
        const size_t size = lua::objlen(L, 1);
        std::string text(size, '\0');
        for (size_t i = 0; i < size; i++) {
            // Read element i+1 of the table at stack index 1. The index is
            // passed explicitly because extra arguments may sit above the
            // table. NOTE(review): assumes rawgeti(L, key, tableIndex),
            // mirroring the rawseti(L, key) call used by l_encode.
            lua::rawgeti(L, static_cast<int>(i + 1), 1);
            text[i] = static_cast<char>(lua::tointeger(L, -1) & 0xFF);
            lua::pop(L, 1);
        }
        return lua::pushstring(L, text);
    } else if (auto bytes = lua::touserdata<lua::LuaBytearray>(L, 1)) {
        return lua::pushstring(
            L,
            std::string(
                reinterpret_cast<char*>(bytes->data().data()),
                bytes->data().size()
            )
        );
    }
    // NOTE(review): any other argument type falls through without pushing a
    // value, so whatever is on top of the stack is reported as the result.
    return 1;
}
/// @brief Lua binding for `utf8.length(text)`.
/// Pushes the value computed by util::length_utf8 for the string argument #1.
/// @param L Lua state; argument #1 must be a string
/// @return number of Lua results
static int l_length(lua::State* L) {
    const auto text = lua::require_string(L, 1);
    const auto count = util::length_utf8(text);
    return lua::pushinteger(L, count);
}
/// @brief Lua binding for `utf8.codepoint(chars)`.
/// Pushes the value decoded by util::decode_utf8 from the start of the string
/// argument #1, or 0 when the string is empty.
/// @param L Lua state; argument #1 must be a string
/// @return number of Lua results
static int l_codepoint(lua::State* L) {
    const std::string_view text = lua::require_string(L, 1);
    if (!text.empty()) {
        // Out-parameter required by decode_utf8; its value is not used here.
        uint consumed = 0;
        return lua::pushinteger(L, util::decode_utf8(consumed, text.data()));
    }
    return lua::pushinteger(L, 0);
}
/// @brief Lua binding for `utf8.sub(text, startchar, [endchar])`.
/// Pushes the substring covering characters startchar..endchar (1-based,
/// inclusive), measured in code points rather than bytes. When endchar is
/// omitted the substring runs to the end of the string. Out-of-range bounds
/// are clamped; an empty range yields an empty string.
/// @param L Lua state; argument #1 must be a string
/// @return number of Lua results
static int l_sub(lua::State* L) {
    auto string = util::str2u32str_utf8(lua::require_string(L, 1));
    const int length = static_cast<int>(string.length());
    // Convert 1-based Lua positions to 0-based indices.
    const int start = std::max(0, static_cast<int>(lua::tointeger(L, 2) - 1));
    int last = length - 1;
    if (lua::gettop(L) >= 3) {
        last = std::min(last, static_cast<int>(lua::tointeger(L, 3) - 1));
    }
    if (start > last) {
        // Also covers start beyond the end, where substr() would throw.
        return lua::pushstring(L, std::string());
    }
    // substr() takes (position, count), so turn the inclusive end index
    // into a character count.
    return lua::pushstring(
        L, util::u32str2str_utf8(string.substr(start, last - start + 1))
    );
}
/// Function table of the `utf8` Lua library: maps each script-visible name
/// to its wrapped C++ handler. The list ends with an empty sentinel entry.
const luaL_Reg utf8lib[] = {
    {"tobytes", lua::wrap<l_encode>},
    {"tostring", lua::wrap<l_decode>},
    {"length", lua::wrap<l_length>},
    {"codepoint", lua::wrap<l_codepoint>},
    {"sub", lua::wrap<l_sub>},
    {nullptr, nullptr}
};

View File

@ -51,6 +51,7 @@ static void create_libs(State* L, StateType stateType) {
openlib(L, "quat", quatlib);
openlib(L, "time", timelib);
openlib(L, "toml", tomllib);
openlib(L, "utf8", utf8lib);
openlib(L, "vec2", vec2lib);
openlib(L, "vec3", vec3lib);
openlib(L, "vec4", vec4lib);

View File

@ -128,7 +128,7 @@ inline uint utf8_len(ubyte cp) {
if ((cp & 0xF8) == 0xF0) {
return 4;
}
return 0;
throw std::runtime_error("utf8 decode error");
}
uint32_t util::decode_utf8(uint& size, const char* chr) {
@ -156,6 +156,16 @@ size_t util::crop_utf8(std::string_view s, size_t maxSize) {
return pos;
}
// Counts characters by hopping from one lead byte to the next: each step
// advances by the sequence length that utf8_len reports for the current byte.
size_t util::length_utf8(std::string_view s) {
    size_t count = 0;
    for (size_t offset = 0; offset < s.length(); offset += utf8_len(s[offset])) {
        ++count;
    }
    return count;
}
template<class C>
std::string xstr2str_utf8(const std::basic_string<C>& xs) {
std::vector<char> chars;

View File

@ -44,6 +44,11 @@ namespace util {
/// @param maxSize max encoded string length after crop
/// @return cropped string size (less or equal to maxSize)
size_t crop_utf8(std::string_view s, size_t maxSize);
/// @brief Measure utf8-encoded string length
/// @details Walks the string from one character start to the next using the
/// sequence length derived from each lead byte; continuation bytes are
/// skipped, not validated.
/// @param s source encoded string
/// @return unicode string length (number of codepoints)
/// @throws std::runtime_error ("utf8 decode error") when the byte at a
/// character start is rejected by the lead-byte length check
size_t length_utf8(std::string_view s);
bool is_integer(const std::string& text);
bool is_integer(const std::wstring& text);