fix utf-8 decoder & add u32string functions

This commit is contained in:
MihailRis 2024-08-29 15:09:27 +03:00
parent bd87a586d8
commit 4343e81e00
3 changed files with 60 additions and 22 deletions

View File

@ -116,20 +116,22 @@ const utf_t utf[] = {
};
// NOTE(review): this span is a diff view of utf8_len; the table-scan lines
// (loop over `utf`, `++len`, the "Out of bounds" throw, `return len`) and the
// bit-mask lines appear to be the removed and added variants, interleaved
// without +/- markers.
// The bit-mask lines classify the lead byte `cp`:
//   0xxxxxxx -> 1, 110xxxxx -> 2, 1110xxxx -> 3, 11110xxx -> 4, otherwise 0.
// NOTE(review): 0 is returned for continuation bytes (0x80-0xBF) and for
// 0xF8-0xFF. decode_utf8 stores this result in `size` and then evaluates
// `size - 1` for a shift count, so confirm malformed input is handled by callers.
inline uint utf8_len(ubyte cp) {
uint len = 0;
for (const utf_t* u = utf; u->mask; ++u) {
if ((cp >= u->beg) && (cp <= u->end)) {
break;
if ((cp & 0x80) == 0) {
return 1;
}
++len;
if ((cp & 0xE0) == 0xC0) {
return 2;
}
if (len > 4) /* Out of bounds */
throw std::runtime_error("utf-8 decode error");
return len;
if ((cp & 0xF0) == 0xE0) {
return 3;
}
if ((cp & 0xF8) == 0xF0) {
return 4;
}
return 0;
}
extern uint32_t util::decode_utf8(uint& size, const char* chr) {
uint32_t util::decode_utf8(uint& size, const char* chr) {
size = utf8_len(*chr);
int shift = utf[0].bits_stored * (size - 1);
uint32_t code = (*chr++ & utf[size].mask) << shift;
@ -145,7 +147,7 @@ size_t util::crop_utf8(std::string_view s, size_t maxSize) {
size_t pos = 0;
uint size = 0;
while (pos < s.length()) {
decode_utf8(size, &s.at(pos));
decode_utf8(size, s.data() + pos);
if (pos + size > maxSize) {
return pos;
}
@ -154,11 +156,13 @@ size_t util::crop_utf8(std::string_view s, size_t maxSize) {
return pos;
}
std::string util::wstr2str_utf8(const std::wstring& ws) {
template<class C>
std::string xstr2str_utf8(const std::basic_string<C>& xs) {
std::vector<char> chars;
char buffer[4];
for (wchar_t wc : ws) {
uint size = encode_utf8((uint)wc, (ubyte*)buffer);
ubyte buffer[4];
for (C xc : xs) {
uint size = util::encode_utf8(
static_cast<uint>(xc), buffer);
for (uint i = 0; i < size; i++) {
chars.push_back(buffer[i]);
}
@ -166,15 +170,32 @@ std::string util::wstr2str_utf8(const std::wstring& ws) {
return std::string(chars.data(), chars.size());
}
std::wstring util::str2wstr_utf8(const std::string& s) {
std::vector<wchar_t> chars;
std::string util::wstr2str_utf8(const std::wstring& ws) {
return xstr2str_utf8(ws);
}
std::string util::u32str2str_utf8(const std::u32string& ws) {
return xstr2str_utf8(ws);
}
// Decodes the UTF-8 string `s` into a std::basic_string<C>, appending one
// decoded value per decode_utf8 call.
// NOTE(review): this span is a diff view; the two push_back lines and the two
// return lines look like the removed/added variants of the same statement.
// NOTE(review): the loop advances `pos` by the `size` reported by decode_utf8.
// utf8_len can return 0 for a byte that is not a valid lead byte, in which
// case `pos` would not advance -- confirm malformed input cannot reach this loop.
template<class C>
std::basic_string<C> str2xstr_utf8(const std::string& s) {
std::vector<C> chars;
size_t pos = 0;
uint size = 0;
while (pos < s.length()) {
chars.push_back(decode_utf8(size, &s.at(pos)));
chars.push_back(util::decode_utf8(size, &s.at(pos)));
pos += size;
}
return std::wstring(chars.data(), chars.size());
return std::basic_string<C>(chars.data(), chars.size());
}
/// Decodes a UTF-8 string into a wide string via the generic
/// str2xstr_utf8 helper instantiated for wchar_t.
std::wstring util::str2wstr_utf8(const std::string& s) {
    std::wstring decoded = str2xstr_utf8<wchar_t>(s);
    return decoded;
}
/// Decodes a UTF-8 string into a UTF-32 string via the generic
/// str2xstr_utf8 helper instantiated for char32_t.
std::u32string util::str2u32str_utf8(const std::string& s) {
    std::u32string decoded = str2xstr_utf8<char32_t>(s);
    return decoded;
}
bool util::is_integer(const std::string& text) {

View File

@ -23,11 +23,21 @@ namespace util {
/// @return new UTF-8 encoded string
std::string wstr2str_utf8(const std::wstring& ws);
/// @brief Decode UTF
/// @brief Decode UTF-8 string
/// @param s source encoded string
/// @return new raw decoded string
/// @return new raw decoded wstring
std::wstring str2wstr_utf8(const std::string& s);
/// @brief Encode raw u32string to UTF-8
/// @param ws source raw u32string
/// @return new UTF-8 encoded string
std::string u32str2str_utf8(const std::u32string& ws);
/// @brief Decode UTF-8 string to raw u32string
/// @param s source UTF-8 encoded string
/// @return new raw decoded u32string
std::u32string str2u32str_utf8(const std::string& s);
/// @brief Calculate the length of the UTF-8 encoded string that fits into maxSize
/// @param s source UTF-8 encoded string view
/// @param maxSize max encoded string length after crop

View File

@ -8,3 +8,10 @@ TEST(stringutil, crop_utf8) {
str = str.substr(0, util::crop_utf8(str, 7));
EXPECT_EQ(str, u8"при");
}
// Round trip: UTF-8 -> u32string -> UTF-8 must reproduce the input bytes.
TEST(stringutil, utf8) {
    const std::string source = u8"テキストデモ";
    const std::u32string decoded = util::str2u32str_utf8(source);
    const std::string reencoded = util::u32str2str_utf8(decoded);
    EXPECT_EQ(source, reencoded);
}