diff --git a/src/coders/BasicParser.hpp b/src/coders/BasicParser.hpp index e0a4445a..3b756f57 100644 --- a/src/coders/BasicParser.hpp +++ b/src/coders/BasicParser.hpp @@ -32,7 +32,7 @@ protected: void goBack(size_t count = 1); void reset(); - int64_t parseSimpleInt(int base); + int64_t parseSimpleInt(int base, size_t maxLength = 0xFFFFFFFF); dv::value parseNumber(int sign); dv::value parseNumber(); StringT parseString(CharT chr, bool closeRequired = true); diff --git a/src/coders/BasicParser.inl b/src/coders/BasicParser.inl index 2e7a3623..7af27fe5 100644 --- a/src/coders/BasicParser.inl +++ b/src/coders/BasicParser.inl @@ -349,7 +349,10 @@ std::basic_string BasicParser::parseXmlName() { } template -int64_t BasicParser::parseSimpleInt(int base) { +int64_t BasicParser::parseSimpleInt(int base, size_t maxLength) { + if (maxLength == 0) return 0; + + size_t start = pos; CharT c = peek(); int index = hexchar2int(c); if (index == -1 || index >= base) { @@ -357,7 +360,7 @@ int64_t BasicParser::parseSimpleInt(int base) { } int64_t value = index; pos++; - while (hasNext()) { + while (hasNext() && pos - start < maxLength) { c = source[pos]; while (c == '_') { c = source[++pos]; @@ -476,7 +479,7 @@ std::basic_string BasicParser::parseString( continue; } if (c == 'u' || c == 'x') { - int codepoint = parseSimpleInt(16); + int codepoint = parseSimpleInt(16, c == 'u' ? 
4 : 2); ubyte bytes[4]; int size = util::encode_utf8(codepoint, bytes); CharT chars[4]; diff --git a/src/graphics/ui/markdown.hpp b/src/graphics/ui/markdown.hpp index 0f089d58..39b2f1ec 100644 --- a/src/graphics/ui/markdown.hpp +++ b/src/graphics/ui/markdown.hpp @@ -22,7 +22,7 @@ namespace markdown { Result process(std::wstring_view source, bool eraseMarkdown); template - inline std::basic_string escape(std::string_view source) { + inline std::basic_string escape(std::basic_string_view source) { std::basic_stringstream ss; int pos = 0; while (pos < source.size()) { diff --git a/src/util/stringutil.cpp b/src/util/stringutil.cpp index 79f8a1c5..63271a13 100644 --- a/src/util/stringutil.cpp +++ b/src/util/stringutil.cpp @@ -40,7 +40,7 @@ std::string util::escape(std::string_view s, bool escapeUnicode) { uint cpsize; int codepoint = decode_utf8(cpsize, s.data() + pos); if (escapeUnicode) { - ss << "\\u" << std::hex << codepoint; + ss << "\\u" << std::setw(4) << std::setfill('0') << std::hex << codepoint; } else { ss << std::string(s.data() + pos, cpsize); } diff --git a/test/util/stringutil.cpp b/test/util/stringutil.cpp index fd459663..22df2b35 100644 --- a/test/util/stringutil.cpp +++ b/test/util/stringutil.cpp @@ -1,4 +1,5 @@ #include "util/stringutil.hpp" +#include "coders/BasicParser.hpp" #include @@ -16,6 +17,25 @@ TEST(stringutil, utf8) { EXPECT_EQ(str, str2); } +static std::wstring gen_random_unicode_wstring(int n) { + std::wstring str; + str.resize(n); + for (int i = 0; i < n; i++) { + // wstring is 16 bit in some systems + str[i] = rand() & 0xFFFF; + } + return str; +} + +TEST(stringutil, utf8_random) { + srand(5436324); + + auto str = gen_random_unicode_wstring(10'000); + auto utf8str = util::wstr2str_utf8(str); + auto back = util::str2wstr_utf8(utf8str); + EXPECT_EQ(str, back); +} + TEST(stringutil, base64) { srand(2019); for (size_t size = 0; size < 30; size++) { @@ -47,3 +67,37 @@ TEST(stringutil, base64_urlsafe) { } } } + +class StringParser : 
BasicParser { +public: + StringParser(std::string_view source) : BasicParser("", source) {} + + std::string parse() { + ++pos; + return parseString(source[0], true); + } +}; + +TEST(stringutil, escape_cases) { + auto escaped = util::escape("тест5", true); + auto expected = "\"\\u0442\\u0435\\u0441\\u04425\""; + ASSERT_EQ(expected, escaped); + + srand(345873458); + for (int i = 0; i < 36; i++) { + rand(); + } + + auto str = gen_random_unicode_wstring(40); + auto utf8str = util::wstr2str_utf8(str); + escaped = util::escape(utf8str, true); + + StringParser parser(escaped); + auto restored = parser.parse(); + for (int i = 0; i < utf8str.length(); i++) { + if (utf8str[i] != restored[i]) { + std::cout << i << ": " << (int)utf8str[i] << " " << (int)restored[i] << std::endl; + } + } + EXPECT_EQ(utf8str, restored); +}