From 5ae9686adac1508873f2d980e84becd3496244c2 Mon Sep 17 00:00:00 2001 From: Karen Arutyunov Date: Wed, 26 Feb 2020 17:16:45 +0300 Subject: Add notion of validator to char_scanner and make sure manifest is UTF-8 This involves implementing utf8_validator and UTF-8 utility functions and using them during the manifest parsing, serialization, and rewriting. --- tests/utf8/driver.cxx | 207 +++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 196 insertions(+), 11 deletions(-) (limited to 'tests/utf8') diff --git a/tests/utf8/driver.cxx b/tests/utf8/driver.cxx index 8480dec..f35e65e 100644 --- a/tests/utf8/driver.cxx +++ b/tests/utf8/driver.cxx @@ -13,8 +13,10 @@ #ifdef __cpp_lib_modules_ts import std.core; #endif +import butl.utf8; import butl.utility; #else +#include #include #endif @@ -24,6 +26,17 @@ using namespace butl; int main () { + // utf8() tests. + // + auto utf8_error = [] (const string& s, + codepoint_types ts = codepoint_types::any, + const char32_t* wl = nullptr) + { + string error; + assert (!utf8 (s, error, ts, wl)); + return error; + }; + // Valid sequences. // // Empty. @@ -43,18 +56,36 @@ main () // Ill-formed sequences. // + // Long sequences. + // + assert (!utf8 ("\xF8")); // 5-byte sequence. + assert (!utf8 ("\xFC")); // 6-byte sequence. + + assert (utf8_error ("\xF8") == "5-byte length UTF-8 sequence"); + assert (utf8_error ("\xFC") == "6-byte length UTF-8 sequence"); + assert (utf8_error ("\xFE") == "invalid UTF-8 sequence first byte (0xFE)"); + // 2-byte sequences. // assert (!utf8 ("\xC1\x80")); // Invalid first byte. assert (!utf8 ("\xD0y")); // Invalid second byte. + assert (utf8_error ("\xC1\x80") == + "invalid UTF-8 sequence first byte (0xC1)"); + + assert (utf8_error ("\xD0y") == + "invalid UTF-8 sequence second byte (0x79 'y')"); + // 3-byte sequences. // assert (!utf8 ("\xE2\x70\x80")); // Invalid second byte. assert (!utf8 ("\xE2\x80\x70")); // Invalid third byte. 
- assert (!utf8 ("\xED\xA0\x80")); // Min UTF-16 surrogate value. - assert (!utf8 ("\xED\xBF\xBF")); // Max UTF-16 surrogate value. + assert (!utf8 ("\xED\xA0\x80")); // Min UTF-16 surrogate. + assert (!utf8 ("\xED\xBF\xBF")); // Max UTF-16 surrogate. + + assert (utf8_error ("\xE2\x80\x70") == + "invalid UTF-8 sequence third byte (0x70 'p')"); // 4-byte sequences. // @@ -63,9 +94,8 @@ main () assert (!utf8 ("\xF0\x90\x70\x80")); // Invalid third byte. assert (!utf8 ("\xF1\x80\x80\xC0")); // Invalid forth byte. - // Out of the codepoint range (0x10ffff + 1). - // - assert (!utf8 ("\xF4\x90\x80\x80")); + assert (utf8_error ("\xF1\x80\x80\xC0") == + "invalid UTF-8 sequence forth byte (0xC0)"); // Incomplete sequences. // @@ -73,14 +103,25 @@ main () assert (!utf8 ("\xE4\xBA")); // 3-byte sequence. assert (!utf8 ("\xF0\x90\x8C")); // 4-byte sequence. + assert (utf8_error ("\xD0") == "incomplete UTF-8 sequence"); + // Missing sequence leading bytes. // - assert (!utf8 ("\xB0xyz")); // 2-byte sequence. - assert (!utf8 ("\xBA\x8Cxyz")); // 3-byte sequence. - assert (!utf8 ("\x8Cxyz")); // 3-byte sequence. - assert (!utf8 ("\x90\x8C\x82xyz")); // 4-byte sequence. - assert (!utf8 ("\x8C\x82xyz")); // 4-byte sequence. - assert (!utf8 ("\x82xyz")); // 4-byte sequence. + assert (!utf8 ("\xB0xyz")); // 2-byte sequence. + assert (!utf8 ("\xBA\x8C\xD0\xB0yz")); // 3-byte sequence. + assert (!utf8 ("\x8Cxyz")); // 3-byte sequence. + assert (!utf8 ("\x90\x8C\x82xyz")); // 4-byte sequence. + assert (!utf8 ("\x8C\x82xyz")); // 4-byte sequence. + assert (!utf8 ("\x82xyz")); // 4-byte sequence. + + assert (utf8_error ("\xB0") == "invalid UTF-8 sequence first byte (0xB0)"); + + // Above the valid codepoint range (0x10ffff + 1). + // + assert (!utf8 ("\xF4\x90\x80\x80")); + + assert (utf8_error ("\xF4\x90\x80\x80") == + "invalid UTF-8 sequence second byte (0x90)"); // Whitelisting. // @@ -145,6 +186,9 @@ main () assert (!utf8 ("\xF3\xA1\x80\x80", codepoint_types::graphic)); // Reserved. 
assert (!utf8 ("\xF3\xA0\x81\xBF", codepoint_types::graphic)); // Format. + assert (utf8_error ("\xF3\xA0\x81\xBF", codepoint_types::graphic) == + "invalid Unicode codepoint (format)"); + assert (!utf8 ("\xC2\xAC", codepoint_types::format)); // Graphic. // Private-use & Graphic. @@ -153,4 +197,145 @@ main () codepoint_types::format)); assert (!utf8 ("a", codepoint_types::none)); // None. + + assert (utf8_error ("a", codepoint_types::none) == + "invalid Unicode codepoint (graphic)"); + + // UTF-8 string length. + // + auto invalid_utf8 = [] (string s, codepoint_types ts = codepoint_types::any) + { + try + { + utf8_length (s, ts); + return false; + } + catch (const invalid_argument&) + { + return true; + } + }; + + assert (utf8_length ("") == 0); + assert (utf8_length ("x\xD0\xB0\xE4\xBA\x8C\xF0\x90\x8C\x82y") == 5); + + assert (invalid_utf8 ("\xFE")); // Invalid byte. + assert (invalid_utf8 ("\xD0")); // Incomplete. + assert (invalid_utf8 ("\n", codepoint_types::graphic)); // Invalid codepoint. + + // to_utf8() tests. + // + auto roundtrip = [] (const char* s) + { + string r (s); + to_utf8 (r, '?'); + return r == s; + }; + + auto sanitize = [] (string s, codepoint_types ts = codepoint_types::any) + { + to_utf8 (s, '?', ts); + return s; + }; + + // Empty. + // + assert (roundtrip ("")); + + // 1 code point. + // + assert (roundtrip ("a")); // 1 byte. + assert (roundtrip ("\xD0\xB0")); // 2 bytes. + assert (roundtrip ("\xE4\xBA\x8C")); // 3 bytes. + assert (roundtrip ("\xF0\x90\x8C\x82")); // 4 bytes. + + // Multiple code points. + // + assert (roundtrip ("a\xD0\xB0\xE4\xBA\x8C\xF0\x90\x8C\x82")); + + // Ill-formed sequences. + // + // Long sequence. + // + assert (sanitize ("\xF8") == "?"); // 5-byte sequence. + + // Invalid first byte followed by a second byte which ... + // + assert (sanitize ("\xC1\x80") == "??"); // is a trailing byte. + assert (sanitize ("\xC1y") == "?y"); // starts 1-byte sequence. 
+ assert (sanitize ("\xC1\xD0\xB0") == "?\xD0\xB0"); // starts 2-byte sequence. + assert (sanitize ("\xC1\xFE") == "??"); // is not UTF-8. + + // Invalid second byte which ... + // + assert (sanitize ("\xD0y") == "?y"); // starts 1-byte sequence. + assert (sanitize ("\xD0\xD0\xB0") == "?\xD0\xB0"); // starts 2-byte sequence. + assert (sanitize ("\xD0\xFE") == "??"); // is not UTF-8. + + // Incomplete sequences. + // + assert (sanitize ("\xD0") == "?"); // 2-byte sequence. + assert (sanitize ("y\xD0") == "y?"); // 2-byte sequence. + assert (sanitize ("\xE4\xBA") == "??"); // 3-byte sequence. + assert (sanitize ("\xD0\xD0") == "??"); // 2-byte sequence. + + // Incomplete recovery. + // + assert (sanitize ("\xD0\xFE") == "??"); // 2-byte sequence. + assert (sanitize ("\xD0\xFE\xFE") == "???"); // 2-byte sequence. + + assert (sanitize ("\xF4\x90\x80\x80") == "????"); // Above the range. + assert (sanitize ("\xED\xA0\x80") == "???"); // Min UTF-16 surrogate. + assert (sanitize ("\xED\xBF\xBF") == "???"); // Max UTF-16 surrogate. + + // Invalid codepoints. + // + auto sanitize_g = [&sanitize] (string s) + { + return sanitize (move (s), codepoint_types::graphic); + }; + + assert (sanitize_g ("\xEF\xB7\x90") == "?"); + assert (sanitize_g ("y\xEF\xB7\x90") == "y?"); + assert (sanitize_g ("\xEF\xB7\x90y") == "?y"); + + // Invalid during recovery. + // + assert (sanitize_g ("\xD0\n") == "??"); + assert (sanitize_g ("\xD0\ny") == "??y"); + assert (sanitize_g ("\xD0\xFE\n") == "???"); + + assert (sanitize_g ("\xD0\xEF\xB7\x90") == "??"); + + // utf8_validator::codepoint() tests. + // + { + u32string r; + size_t invalid_codepoints (0); + + string s ("a" + "\xD0\xB0" + "\n" // Control. + "\xE4\xBA\x8C" + "\xEE\x80\x80" // Private-use. 
+ "\xF0\x90\x8C\x82"); + + utf8_validator val (codepoint_types::graphic); + + for (char c: s) + { + pair<bool, bool> v (val.validate (c)); + + if (v.first) + { + if (v.second) + r.push_back (val.codepoint ()); + } + else + ++invalid_codepoints; + } + + assert (r == U"a\x430\x4E8C\x10302"); + assert (invalid_codepoints == 2); + } } -- cgit v1.1