fix unicode string literal escape

This commit is contained in:
MihailRis 2025-09-23 00:08:05 +03:00
parent 8b38d57966
commit 28c821006a
5 changed files with 63 additions and 6 deletions

View File

@ -32,7 +32,7 @@ protected:
void goBack(size_t count = 1);
void reset();
int64_t parseSimpleInt(int base);
int64_t parseSimpleInt(int base, size_t maxLength = 0xFFFFFFFF);
dv::value parseNumber(int sign);
dv::value parseNumber();
StringT parseString(CharT chr, bool closeRequired = true);

View File

@ -349,7 +349,10 @@ std::basic_string<CharT> BasicParser<CharT>::parseXmlName() {
}
template <typename CharT>
int64_t BasicParser<CharT>::parseSimpleInt(int base) {
int64_t BasicParser<CharT>::parseSimpleInt(int base, size_t maxLength) {
if (maxLength == 0) return 0;
size_t start = pos;
CharT c = peek();
int index = hexchar2int(c);
if (index == -1 || index >= base) {
@ -357,7 +360,7 @@ int64_t BasicParser<CharT>::parseSimpleInt(int base) {
}
int64_t value = index;
pos++;
while (hasNext()) {
while (hasNext() && pos - start < maxLength) {
c = source[pos];
while (c == '_') {
c = source[++pos];
@ -476,7 +479,7 @@ std::basic_string<CharT> BasicParser<CharT>::parseString(
continue;
}
if (c == 'u' || c == 'x') {
int codepoint = parseSimpleInt(16);
int codepoint = parseSimpleInt(16, c == 'u' ? 4 : 2);
ubyte bytes[4];
int size = util::encode_utf8(codepoint, bytes);
CharT chars[4];

View File

@ -22,7 +22,7 @@ namespace markdown {
Result<wchar_t> process(std::wstring_view source, bool eraseMarkdown);
template <typename CharT>
inline std::basic_string<CharT> escape(std::string_view source) {
inline std::basic_string<CharT> escape(std::basic_string_view<CharT> source) {
std::basic_stringstream<CharT> ss;
int pos = 0;
while (pos < source.size()) {

View File

@ -40,7 +40,7 @@ std::string util::escape(std::string_view s, bool escapeUnicode) {
uint cpsize;
int codepoint = decode_utf8(cpsize, s.data() + pos);
if (escapeUnicode) {
ss << "\\u" << std::hex << codepoint;
ss << "\\u" << std::setw(4) << std::setfill('0') << std::hex << codepoint;
} else {
ss << std::string(s.data() + pos, cpsize);
}

View File

@ -1,4 +1,5 @@
#include "util/stringutil.hpp"
#include "coders/BasicParser.hpp"
#include <gtest/gtest.h>
@ -16,6 +17,25 @@ TEST(stringutil, utf8) {
EXPECT_EQ(str, str2);
}
/// Builds a wide string of `n` pseudo-random characters drawn from rand().
/// The result depends on the current rand() state, so callers seed it first
/// to get reproducible data.
static std::wstring gen_random_unicode_wstring(int n) {
    std::wstring result(n, L'\0');
    for (wchar_t& ch : result) {
        // wchar_t is only 16 bits wide on some platforms, so every value is
        // masked down to the 0..0xFFFF range.
        ch = rand() & 0xFFFF;
    }
    return result;
}
// Round-trip check: a random wide string converted to UTF-8 and back must
// compare equal to the original.
TEST(stringutil, utf8_random) {
    // Fixed seed keeps the generated input identical between runs.
    srand(5436324);
    const auto str = gen_random_unicode_wstring(10'000);
    const auto back = util::str2wstr_utf8(util::wstr2str_utf8(str));
    EXPECT_EQ(str, back);
}
TEST(stringutil, base64) {
srand(2019);
for (size_t size = 0; size < 30; size++) {
@ -47,3 +67,37 @@ TEST(stringutil, base64_urlsafe) {
}
}
}
/// Test-only wrapper that makes BasicParser<char>::parseString callable on an
/// arbitrary piece of text.
class StringParser : BasicParser<char> {
public:
    StringParser(std::string_view source)
        : BasicParser<char>("<string>", source) {
    }

    /// Parses the whole source as one string literal. The first character is
    /// passed to parseString as its `chr` argument (presumably the opening
    /// quote) and parsing continues from the character after it, with a
    /// closing character required.
    std::string parse() {
        const char quote = source[0];
        pos++;
        return parseString(quote, true);
    }
};
// Checks util::escape with unicode escaping enabled: first against a fixed
// expected literal, then by parsing escaped random text back with
// StringParser and comparing the result with the input.
TEST(stringutil, escape_cases) {
    auto escaped = util::escape("тест5", true);
    // The literal digit '5' directly follows a four-digit \u escape, so the
    // escape has to be fixed-width for the output to stay unambiguous.
    auto expected = "\"\\u0442\\u0435\\u0441\\u04425\"";
    ASSERT_EQ(expected, escaped);

    srand(345873458);
    // Discard the first 36 values so the generated string starts at a later
    // point of the sequence (presumably the input that exposed the bug).
    for (int i = 0; i < 36; i++) {
        rand();
    }
    auto str = gen_random_unicode_wstring(40);
    auto utf8str = util::wstr2str_utf8(str);
    escaped = util::escape(utf8str, true);

    StringParser parser(escaped);
    auto restored = parser.parse();

    // Report mismatching bytes over the common prefix only: indexing
    // `restored` past its end would be undefined behaviour when the parser
    // returns a shorter string. A length difference is still caught by the
    // EXPECT_EQ below.
    const size_t common = utf8str.length() < restored.length()
                              ? utf8str.length()
                              : restored.length();
    for (size_t i = 0; i < common; i++) {
        if (utf8str[i] != restored[i]) {
            std::cout << i << ": " << static_cast<int>(utf8str[i]) << " "
                      << static_cast<int>(restored[i]) << std::endl;
        }
    }
    EXPECT_EQ(utf8str, restored);
}