diff --git a/src/util/stringutil.cpp b/src/util/stringutil.cpp
index 8f3bddd8..97ca4786 100644
--- a/src/util/stringutil.cpp
+++ b/src/util/stringutil.cpp
@@ -116,20 +116,28 @@ const utf_t utf[] = {
 };
 
+/// @brief Get UTF-8 sequence length from its leading byte
+/// @param cp first byte of the encoded sequence
+/// @return sequence length in bytes (1-4)
+/// @throws std::runtime_error if cp is not a valid leading byte
 inline uint utf8_len(ubyte cp) {
-    uint len = 0;
-    for (const utf_t* u = utf; u->mask; ++u) {
-        if ((cp >= u->beg) && (cp <= u->end)) {
-            break;
-        }
-        ++len;
+    if ((cp & 0x80) == 0) {
+        return 1;
     }
-    if (len > 4) /* Out of bounds */
-        throw std::runtime_error("utf-8 decode error");
-
-    return len;
+    if ((cp & 0xE0) == 0xC0) {
+        return 2;
+    }
+    if ((cp & 0xF0) == 0xE0) {
+        return 3;
+    }
+    if ((cp & 0xF8) == 0xF0) {
+        return 4;
+    }
+    // continuation byte (10xxxxxx) or invalid 11111xxx prefix: never
+    // report zero length, callers advance their position by the result
+    throw std::runtime_error("utf-8 decode error");
 }
 
-extern uint32_t util::decode_utf8(uint& size, const char* chr) {
+uint32_t util::decode_utf8(uint& size, const char* chr) {
     size = utf8_len(*chr);
     int shift = utf[0].bits_stored * (size - 1);
     uint32_t code = (*chr++ & utf[size].mask) << shift;
@@ -145,7 +153,7 @@ size_t util::crop_utf8(std::string_view s, size_t maxSize) {
     size_t pos = 0;
     uint size = 0;
     while (pos < s.length()) {
-        decode_utf8(size, &s.at(pos));
+        decode_utf8(size, s.data() + pos);
         if (pos + size > maxSize) {
             return pos;
         }
@@ -154,11 +162,17 @@ size_t util::crop_utf8(std::string_view s, size_t maxSize) {
     return pos;
 }
 
-std::string util::wstr2str_utf8(const std::wstring& ws) {
+/// @brief Encode a string of raw code points to UTF-8
+/// @tparam C source character type (used with wchar_t and char32_t)
+/// @param xs source raw string
+/// @return new UTF-8 encoded string
+template<class C>
+std::string xstr2str_utf8(const std::basic_string<C>& xs) {
     std::vector<char> chars;
-    char buffer[4];
-    for (wchar_t wc : ws) {
-        uint size = encode_utf8((uint)wc, (ubyte*)buffer);
+    ubyte buffer[4];
+    for (C xc : xs) {
+        uint size = util::encode_utf8(
+            static_cast<uint32_t>(xc), buffer);
         for (uint i = 0; i < size; i++) {
             chars.push_back(buffer[i]);
         }
@@ -166,15 +180,36 @@ std::string util::wstr2str_utf8(const std::wstring& ws) {
     return std::string(chars.data(), chars.size());
 }
 
-std::wstring util::str2wstr_utf8(const std::string& s) {
-    std::vector<wchar_t> chars;
+std::string util::wstr2str_utf8(const std::wstring& ws) {
+    return xstr2str_utf8(ws);
+}
+
+std::string util::u32str2str_utf8(const std::u32string& ws) {
+    return xstr2str_utf8(ws);
+}
+
+/// @brief Decode UTF-8 string into a string of raw code points
+/// @tparam C destination character type (used with wchar_t and char32_t)
+/// @param s source encoded string
+/// @return new raw decoded string, one element per decoded code point
+template<class C>
+std::basic_string<C> str2xstr_utf8(const std::string& s) {
+    std::vector<C> chars;
     size_t pos = 0;
     uint size = 0;
     while (pos < s.length()) {
-        chars.push_back(decode_utf8(size, &s.at(pos)));
+        chars.push_back(util::decode_utf8(size, &s.at(pos)));
         pos += size;
     }
-    return std::wstring(chars.data(), chars.size());
+    return std::basic_string<C>(chars.data(), chars.size());
+}
+
+std::wstring util::str2wstr_utf8(const std::string& s) {
+    return str2xstr_utf8<wchar_t>(s);
+}
+
+std::u32string util::str2u32str_utf8(const std::string& s) {
+    return str2xstr_utf8<char32_t>(s);
 }
 
 bool util::is_integer(const std::string& text) {
diff --git a/src/util/stringutil.hpp b/src/util/stringutil.hpp
index 14ba9432..38e7cf89 100644
--- a/src/util/stringutil.hpp
+++ b/src/util/stringutil.hpp
@@ -23,11 +23,21 @@ namespace util {
     /// @return new UTF-8 encoded string
     std::string wstr2str_utf8(const std::wstring& ws);
 
-    /// @brief Decode UTF
+    /// @brief Decode UTF-8 string
     /// @param s source encoded string
-    /// @return new raw decoded string
+    /// @return new raw decoded wstring
     std::wstring str2wstr_utf8(const std::string& s);
 
+    /// @brief Encode raw u32string to UTF-8
+    /// @param ws source raw u32string
+    /// @return new UTF-8 encoded string
+    std::string u32str2str_utf8(const std::u32string& ws);
+
+    /// @brief Decode UTF-8 string
+    /// @param s source encoded string
+    /// @return new raw decoded u32string
+    std::u32string str2u32str_utf8(const std::string& s);
+
     /// @brief Calculated length of UTF-8 encoded string that fits into maxSize
     /// @param s source UTF-8 encoded string view
     /// @param maxSize max encoded string length after crop
diff --git a/test/util/stringutil.cpp b/test/util/stringutil.cpp
index ed6e2b00..7bd0f85c 100644
--- a/test/util/stringutil.cpp
+++ b/test/util/stringutil.cpp
@@ -8,3 +8,10 @@ TEST(stringutil, crop_utf8) {
     str = str.substr(0, util::crop_utf8(str, 7));
     EXPECT_EQ(str, u8"при");
 }
+
+TEST(stringutil, utf8) {
+    std::string str = u8"テキストデモ";
+    auto u32str = util::str2u32str_utf8(str);
+    std::string str2 = util::u32str2str_utf8(u32str);
+    EXPECT_EQ(str, str2);
+}