fix utf-8 decoder & add u32string functions
This commit is contained in:
parent
bd87a586d8
commit
4343e81e00
@ -116,20 +116,22 @@ const utf_t utf[] = {
|
||||
};
|
||||
|
||||
/// @brief Get the length in bytes of a UTF-8 sequence from its first byte.
/// NOTE(review): this span comes from a rendered diff, so the statements of
/// the removed table-driven loop (range check against the utf[] entries,
/// throwing on len > 4) and of the added bit-mask checks appear interleaved.
/// @param cp first byte of the encoded sequence
/// @return per the bit-mask checks: 1 for 0xxxxxxx, 2 for 110xxxxx,
/// 3 for 1110xxxx, 4 for 11110xxx, otherwise 0
inline uint utf8_len(ubyte cp) {
|
||||
uint len = 0;
|
||||
for (const utf_t* u = utf; u->mask; ++u) {
|
||||
if ((cp >= u->beg) && (cp <= u->end)) {
|
||||
break;
|
||||
if ((cp & 0x80) == 0) {
|
||||
return 1;
|
||||
}
|
||||
++len;
|
||||
if ((cp & 0xE0) == 0xC0) {
|
||||
return 2;
|
||||
}
|
||||
if (len > 4) /* Out of bounds */
|
||||
throw std::runtime_error("utf-8 decode error");
|
||||
|
||||
return len;
|
||||
if ((cp & 0xF0) == 0xE0) {
|
||||
return 3;
|
||||
}
|
||||
if ((cp & 0xF8) == 0xF0) {
|
||||
return 4;
|
||||
}
|
||||
// No lead-byte pattern matched (for example a 10xxxxxx byte).
// NOTE(review): decode_utf8 computes (size - 1) on an unsigned value from
// this result and the decoding loops advance by `size`; confirm that a 0
// result is rejected there, otherwise it may underflow or never advance.
return 0;
|
||||
}
|
||||
|
||||
extern uint32_t util::decode_utf8(uint& size, const char* chr) {
|
||||
uint32_t util::decode_utf8(uint& size, const char* chr) {
|
||||
size = utf8_len(*chr);
|
||||
int shift = utf[0].bits_stored * (size - 1);
|
||||
uint32_t code = (*chr++ & utf[size].mask) << shift;
|
||||
@ -145,7 +147,7 @@ size_t util::crop_utf8(std::string_view s, size_t maxSize) {
|
||||
size_t pos = 0;
|
||||
uint size = 0;
|
||||
while (pos < s.length()) {
|
||||
decode_utf8(size, &s.at(pos));
|
||||
decode_utf8(size, s.data() + pos);
|
||||
if (pos + size > maxSize) {
|
||||
return pos;
|
||||
}
|
||||
@ -154,11 +156,13 @@ size_t util::crop_utf8(std::string_view s, size_t maxSize) {
|
||||
return pos;
|
||||
}
|
||||
|
||||
std::string util::wstr2str_utf8(const std::wstring& ws) {
|
||||
template<class C>
|
||||
std::string xstr2str_utf8(const std::basic_string<C>& xs) {
|
||||
std::vector<char> chars;
|
||||
char buffer[4];
|
||||
for (wchar_t wc : ws) {
|
||||
uint size = encode_utf8((uint)wc, (ubyte*)buffer);
|
||||
ubyte buffer[4];
|
||||
for (C xc : xs) {
|
||||
uint size = util::encode_utf8(
|
||||
static_cast<uint>(xc), buffer);
|
||||
for (uint i = 0; i < size; i++) {
|
||||
chars.push_back(buffer[i]);
|
||||
}
|
||||
@ -166,15 +170,32 @@ std::string util::wstr2str_utf8(const std::wstring& ws) {
|
||||
return std::string(chars.data(), chars.size());
|
||||
}
|
||||
|
||||
std::wstring util::str2wstr_utf8(const std::string& s) {
|
||||
std::vector<wchar_t> chars;
|
||||
/// @brief Encode a wstring to UTF-8 by forwarding to xstr2str_utf8.
/// @param ws source wstring
/// @return the string produced by xstr2str_utf8
std::string util::wstr2str_utf8(const std::wstring& ws) {
|
||||
return xstr2str_utf8(ws);
|
||||
}
|
||||
|
||||
/// @brief Encode a u32string to UTF-8 by forwarding to xstr2str_utf8.
/// @param ws source u32string
/// @return the string produced by xstr2str_utf8
std::string util::u32str2str_utf8(const std::u32string& ws) {
|
||||
return xstr2str_utf8(ws);
|
||||
}
|
||||
|
||||
/// @brief Decode a UTF-8 encoded string into a string with elements of type C.
/// NOTE(review): this span comes from a rendered diff, so the removed and the
/// added variants of the push_back and return statements both appear below.
/// @tparam C element type of the resulting string
/// @param s source UTF-8 encoded string
/// @return string holding one element per value returned by decode_utf8
template<class C>
|
||||
std::basic_string<C> str2xstr_utf8(const std::string& s) {
|
||||
std::vector<C> chars;
|
||||
size_t pos = 0;
|
||||
uint size = 0;
|
||||
while (pos < s.length()) {
|
||||
chars.push_back(decode_utf8(size, &s.at(pos)));
|
||||
// NOTE(review): decode_utf8 returns uint32_t; if C is narrower than 32 bits
// the value is narrowed when stored — confirm this is intended for wchar_t.
chars.push_back(util::decode_utf8(size, &s.at(pos)));
|
||||
// Advance by the byte length that decode_utf8 wrote into `size`.
// NOTE(review): if `size` can come back as 0 (utf8_len has a `return 0`
// path), this loop would not advance — confirm decode_utf8 prevents that.
pos += size;
|
||||
}
|
||||
return std::wstring(chars.data(), chars.size());
|
||||
return std::basic_string<C>(chars.data(), chars.size());
|
||||
}
|
||||
|
||||
/// @brief Decode a UTF-8 string to a wstring via str2xstr_utf8<wchar_t>.
/// @param s source UTF-8 encoded string
/// @return the wstring produced by str2xstr_utf8<wchar_t>
std::wstring util::str2wstr_utf8(const std::string& s) {
|
||||
return str2xstr_utf8<wchar_t>(s);
|
||||
}
|
||||
|
||||
/// @brief Decode a UTF-8 string to a u32string via str2xstr_utf8<char32_t>.
/// @param s source UTF-8 encoded string
/// @return the u32string produced by str2xstr_utf8<char32_t>
std::u32string util::str2u32str_utf8(const std::string& s) {
|
||||
return str2xstr_utf8<char32_t>(s);
|
||||
}
|
||||
|
||||
bool util::is_integer(const std::string& text) {
|
||||
|
||||
@ -23,11 +23,21 @@ namespace util {
|
||||
/// @return new UTF-8 encoded string
|
||||
std::string wstr2str_utf8(const std::wstring& ws);
|
||||
|
||||
/// @brief Decode UTF
|
||||
/// @brief Decode UTF-8 string
|
||||
/// @param s source encoded string
|
||||
/// @return new raw decoded string
|
||||
/// @return new raw decoded wstring
|
||||
std::wstring str2wstr_utf8(const std::string& s);
|
||||
|
||||
/// @brief Encode raw u32string to UTF-8
|
||||
/// @param ws source raw u32string
|
||||
/// @return new UTF-8 encoded string
|
||||
std::string u32str2str_utf8(const std::u32string& ws);
|
||||
|
||||
/// @brief Decode UTF-8 string
|
||||
/// @param s source encoded string
|
||||
/// @return new raw decoded u32string
|
||||
std::u32string str2u32str_utf8(const std::string& s);
|
||||
|
||||
/// @brief Calculate length of UTF-8 encoded string that fits into maxSize
|
||||
/// @param s source UTF-8 encoded string view
|
||||
/// @param maxSize max encoded string length after crop
|
||||
|
||||
@ -8,3 +8,10 @@ TEST(stringutil, crop_utf8) {
|
||||
str = str.substr(0, util::crop_utf8(str, 7));
|
||||
EXPECT_EQ(str, u8"при");
|
||||
}
|
||||
|
||||
// Round-trip check: decode a UTF-8 literal with str2u32str_utf8, encode the
// result back with u32str2str_utf8 and expect the original string.
TEST(stringutil, utf8) {
|
||||
std::string str = u8"テキストデモ";
|
||||
auto u32str = util::str2u32str_utf8(str);
|
||||
std::string str2 = util::u32str2str_utf8(u32str);
|
||||
EXPECT_EQ(str, str2);
|
||||
}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user