From 5ae9686adac1508873f2d980e84becd3496244c2 Mon Sep 17 00:00:00 2001 From: Karen Arutyunov Date: Wed, 26 Feb 2020 17:16:45 +0300 Subject: Add notion of validator to char_scanner and make sure manifest is UTF-8 This involves implementing utf8_validator and UTF-8 utility functions and using them during the manifest parsing, serialization, and rewriting. --- tests/manifest-parser/driver.cxx | 40 ++++++- tests/manifest-rewriter/driver.cxx | 4 + tests/manifest-serializer/driver.cxx | 18 +++ tests/utf8/driver.cxx | 207 +++++++++++++++++++++++++++++++++-- 4 files changed, 257 insertions(+), 12 deletions(-) (limited to 'tests') diff --git a/tests/manifest-parser/driver.cxx b/tests/manifest-parser/driver.cxx index 57674cb..a34f2b7 100644 --- a/tests/manifest-parser/driver.cxx +++ b/tests/manifest-parser/driver.cxx @@ -40,6 +40,9 @@ namespace butl static bool equal (const optional& actual, const optional& expected); + static pairs + parse (const char* m, manifest_parser::filter_function f = {}); + // Test manifest as it is represented in the stream, including format // version and end-of-manifest values. // @@ -188,6 +191,41 @@ namespace butl assert (p.first == "" && p.second == "comment"); } + // UTF-8. + // + assert (test (":1\n#\xD0\xB0\n\xD0\xB0y\xD0\xB0:\xD0\xB0z\xD0\xB0", + {{"","1"}, + {"\xD0\xB0y\xD0\xB0", "\xD0\xB0z\xD0\xB0"}, + {"",""}, + {"",""}})); + + assert (fail (":1\n#\xD0\n\xD0\xB0y\xD0\xB0:\xD0\xB0z\xD0\xB0")); + assert (fail (":1\n#\xD0\xB0\n\xB0y\xD0\xB0:\xD0\xB0z\xD0\xB0")); + assert (fail (":1\n#\xD0\xB0\n\xD0y\xD0\xB0:\xD0\xB0z\xD0\xB0")); + assert (fail (":1\n#\xD0\xB0\n\xD0\xB0y\xD0:\xD0\xB0z\xD0\xB0")); + assert (fail (":1\n#\xD0\xB0\n\xD0\xB0y\xD0\xB0:\xD0z\xD0\xB0")); + assert (fail (":1\n#\xD0\xB0\n\xD0\xB0y\xD0\xB0:\xD0\xB0z\xD0")); + assert (fail (":1\r\r\xB0#\n\xD0\xB0y\xD0\xB0:\xD0\xB0z\xD0\xB0")); + assert (fail (":1\r\xD0#\n\xD0\xB0y\xD0\xB0:\xD0\xB0z\xD0\xB0")); + assert (fail (":1\n#\xD0\xB0\n\xD0\xB0y\xD0\xB0:\xD0\xB0z\xD0\xB0\r\xD0")); + + // Test parsing failure for manifest with multi-byte UTF-8 sequences + // (the column is properly reported, etc). + // + try + { + parse (":1\na\xD0\xB0\xD0\xB0\xFE"); + assert (false); + } + catch (const manifest_parsing& e) + { + assert (e.line == 2 && + e.column == 4 && + e.description == + "invalid manifest name: " + "invalid UTF-8 sequence first byte (0xFE)"); + } + // Filtering. // assert (test (":1\na: abc\nb: bca\nc: cab", @@ -281,7 +319,7 @@ namespace butl } static pairs - parse (const char* m, manifest_parser::filter_function f = {}) + parse (const char* m, manifest_parser::filter_function f) { istringstream is (m); is.exceptions (istream::failbit | istream::badbit); diff --git a/tests/manifest-rewriter/driver.cxx b/tests/manifest-rewriter/driver.cxx index fd76929..ec73d81 100644 --- a/tests/manifest-rewriter/driver.cxx +++ b/tests/manifest-rewriter/driver.cxx @@ -90,6 +90,10 @@ namespace butl {{"abc", "xyz"}}) == ":1\n abc: \\\nxyz\n\\"); + assert (edit (":1\n a\xD0\xB0g : b", + {{"a\xD0\xB0g", "xyz"}}) == + ":1\n a\xD0\xB0g : \\\nxyz\n\\"); + // Test editing of manifests that contains CR characters. // assert (edit (":1\r\na: b\r\r\n", {{"a", "xyz"}}) == ":1\r\na: xyz\r\r\n"); diff --git a/tests/manifest-serializer/driver.cxx b/tests/manifest-serializer/driver.cxx index 148a281..c818b4a 100644 --- a/tests/manifest-serializer/driver.cxx +++ b/tests/manifest-serializer/driver.cxx @@ -46,6 +46,7 @@ main () assert (test ({{"#", "x"}}, "# x\n")); assert (test ({{"#", "x"},{"#", "y"},{"#", ""}}, "# x\n# y\n#\n")); assert (fail ({{"",""},{"#", "x"}})); // serialization after eos + assert (fail ({{"#", "\xB0"}})); // invalid UTF-8 sequence // Empty manifest stream. // @@ -89,6 +90,12 @@ main () assert (fail ({{"","1"},{"a b",""}})); assert (fail ({{"","1"},{"a\tb",""}})); assert (fail ({{"","1"},{"a\n",""}})); + assert (fail ({{"","1"},{"a\xB0",""}})); // invalid UTF-8 sequence + + // Invalid value. + // + assert (fail ({{"","1"},{"a","\xB0"}})); // invalid UTF-8 sequence + assert (fail ({{"","1"},{"a","\xD0"}})); // incomplete UTF-8 sequence // Simple value. // @@ -172,11 +179,22 @@ main () "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\\Y\\\n" "yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy"); + // Hard break after the UTF-8/delayed hard break. + // + string l6 ("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" + "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\xF0\x90\x8C\x82" + "\xF0\x90\x8C\x82yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy"); + + string e6 ("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" + "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\xF0\x90\x8C\x82\\\n" + "\xF0\x90\x8C\x82yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy"); + assert (test ({{"","1"},{"a",l1},{"",""},{"",""}}, ": 1\na: " + e1 + "\n")); assert (test ({{"","1"},{"a",l2},{"",""},{"",""}}, ": 1\na: " + e2 + "\n")); assert (test ({{"","1"},{"a",l3},{"",""},{"",""}}, ": 1\na: " + e3 + "\n")); assert (test ({{"","1"},{"a",l4},{"",""},{"",""}}, ": 1\na: " + e4 + "\n")); assert (test ({{"","1"},{"a",l5},{"",""},{"",""}}, ": 1\na: " + e5 + "\n")); + assert (test ({{"","1"},{"a",l6},{"",""},{"",""}}, ": 1\na: " + e6 + "\n")); // Multi-line value. // diff --git a/tests/utf8/driver.cxx b/tests/utf8/driver.cxx index 8480dec..f35e65e 100644 --- a/tests/utf8/driver.cxx +++ b/tests/utf8/driver.cxx @@ -13,8 +13,10 @@ #ifdef __cpp_lib_modules_ts import std.core; #endif +import butl.utf8; import butl.utility; #else +#include #include #endif @@ -24,6 +26,17 @@ using namespace butl; int main () { + // utf8() tests. + // + auto utf8_error = [] (const string& s, + codepoint_types ts = codepoint_types::any, + const char32_t* wl = nullptr) + { + string error; + assert (!utf8 (s, error, ts, wl)); + return error; + }; + // Valid sequences. // // Empty. @@ -43,18 +56,36 @@ main () // Ill-formed sequences. // + // Long sequences. + // + assert (!utf8 ("\xF8")); // 5-byte sequence. + assert (!utf8 ("\xFC")); // 6-byte sequence. + + assert (utf8_error ("\xF8") == "5-byte length UTF-8 sequence"); + assert (utf8_error ("\xFC") == "6-byte length UTF-8 sequence"); + assert (utf8_error ("\xFE") == "invalid UTF-8 sequence first byte (0xFE)"); + // 2-byte sequences. // assert (!utf8 ("\xC1\x80")); // Invalid first byte. assert (!utf8 ("\xD0y")); // Invalid second byte. + assert (utf8_error ("\xC1\x80") == + "invalid UTF-8 sequence first byte (0xC1)"); + + assert (utf8_error ("\xD0y") == + "invalid UTF-8 sequence second byte (0x79 'y')"); + // 3-byte sequences. // assert (!utf8 ("\xE2\x70\x80")); // Invalid second byte. assert (!utf8 ("\xE2\x80\x70")); // Invalid third byte. - assert (!utf8 ("\xED\xA0\x80")); // Min UTF-16 surrogate value. - assert (!utf8 ("\xED\xBF\xBF")); // Max UTF-16 surrogate value. + assert (!utf8 ("\xED\xA0\x80")); // Min UTF-16 surrogate. + assert (!utf8 ("\xED\xBF\xBF")); // Max UTF-16 surrogate. + + assert (utf8_error ("\xE2\x80\x70") == + "invalid UTF-8 sequence third byte (0x70 'p')"); // 4-byte sequences. // @@ -63,9 +94,8 @@ main () assert (!utf8 ("\xF0\x90\x70\x80")); // Invalid third byte. assert (!utf8 ("\xF1\x80\x80\xC0")); // Invalid forth byte. - // Out of the codepoint range (0x10ffff + 1). - // - assert (!utf8 ("\xF4\x90\x80\x80")); + assert (utf8_error ("\xF1\x80\x80\xC0") == + "invalid UTF-8 sequence forth byte (0xC0)"); // Incomplete sequences. // @@ -73,14 +103,25 @@ main () assert (!utf8 ("\xE4\xBA")); // 3-byte sequence. assert (!utf8 ("\xF0\x90\x8C")); // 4-byte sequence. + assert (utf8_error ("\xD0") == "incomplete UTF-8 sequence"); + // Missing sequence leading bytes. // - assert (!utf8 ("\xB0xyz")); // 2-byte sequence. - assert (!utf8 ("\xBA\x8Cxyz")); // 3-byte sequence. - assert (!utf8 ("\x8Cxyz")); // 3-byte sequence. - assert (!utf8 ("\x90\x8C\x82xyz")); // 4-byte sequence. - assert (!utf8 ("\x8C\x82xyz")); // 4-byte sequence. - assert (!utf8 ("\x82xyz")); // 4-byte sequence. + assert (!utf8 ("\xB0xyz")); // 2-byte sequence. + assert (!utf8 ("\xBA\x8C\xD0\xB0yz")); // 3-byte sequence. + assert (!utf8 ("\x8Cxyz")); // 3-byte sequence. + assert (!utf8 ("\x90\x8C\x82xyz")); // 4-byte sequence. + assert (!utf8 ("\x8C\x82xyz")); // 4-byte sequence. + assert (!utf8 ("\x82xyz")); // 4-byte sequence. + + assert (utf8_error ("\xB0") == "invalid UTF-8 sequence first byte (0xB0)"); + + // Above the valid codepoint range (0x10ffff + 1). + // + assert (!utf8 ("\xF4\x90\x80\x80")); + + assert (utf8_error ("\xF4\x90\x80\x80") == + "invalid UTF-8 sequence second byte (0x90)"); // Whitelisting. // @@ -145,6 +186,9 @@ main () assert (!utf8 ("\xF3\xA1\x80\x80", codepoint_types::graphic)); // Reserved. assert (!utf8 ("\xF3\xA0\x81\xBF", codepoint_types::graphic)); // Format. + assert (utf8_error ("\xF3\xA0\x81\xBF", codepoint_types::graphic) == + "invalid Unicode codepoint (format)"); + assert (!utf8 ("\xC2\xAC", codepoint_types::format)); // Graphic. // Private-use & Graphic. @@ -153,4 +197,145 @@ main () codepoint_types::format)); assert (!utf8 ("a", codepoint_types::none)); // None. + + assert (utf8_error ("a", codepoint_types::none) == + "invalid Unicode codepoint (graphic)"); + + // UTF-8 string length. + // + auto invalid_utf8 = [] (string s, codepoint_types ts = codepoint_types::any) + { + try + { + utf8_length (s, ts); + return false; + } + catch (const invalid_argument&) + { + return true; + } + }; + + assert (utf8_length ("") == 0); + assert (utf8_length ("x\xD0\xB0\xE4\xBA\x8C\xF0\x90\x8C\x82y") == 5); + + assert (invalid_utf8 ("\xFE")); // Invalid byte. + assert (invalid_utf8 ("\xD0")); // Incomplete. + assert (invalid_utf8 ("\n", codepoint_types::graphic)); // Invalid codepoint. + + // to_utf8() tests. + // + auto roundtrip = [] (const char* s) + { + string r (s); + to_utf8 (r, '?'); + return r == s; + }; + + auto sanitize = [] (string s, codepoint_types ts = codepoint_types::any) + { + to_utf8 (s, '?', ts); + return s; + }; + + // Empty. + // + assert (roundtrip ("")); + + // 1 code point. + // + assert (roundtrip ("a")); // 1 byte. + assert (roundtrip ("\xD0\xB0")); // 2 bytes. + assert (roundtrip ("\xE4\xBA\x8C")); // 3 bytes. + assert (roundtrip ("\xF0\x90\x8C\x82")); // 4 bytes. + + // Multiple code points. + // + assert (roundtrip ("a\xD0\xB0\xE4\xBA\x8C\xF0\x90\x8C\x82")); + + // Ill-formed sequences. + // + // Long sequence. + // + assert (sanitize ("\xF8") == "?"); // 5-byte sequence. + + // Invalid first byte followed by a second byte which ... + // + assert (sanitize ("\xC1\x80") == "??"); // is a trailing byte. + assert (sanitize ("\xC1y") == "?y"); // starts 1-byte sequence. + assert (sanitize ("\xC1\xD0\xB0") == "?\xD0\xB0"); // starts 2-byte sequence. + assert (sanitize ("\xC1\xFE") == "??"); // is not UTF-8. + + // Invalid second byte which ... + // + assert (sanitize ("\xD0y") == "?y"); // starts 1-byte sequence. + assert (sanitize ("\xD0\xD0\xB0") == "?\xD0\xB0"); // starts 2-byte sequence. + assert (sanitize ("\xD0\xFE") == "??"); // is not UTF-8. + + // Incomplete sequences. + // + assert (sanitize ("\xD0") == "?"); // 2-byte sequence. + assert (sanitize ("y\xD0") == "y?"); // 2-byte sequence. + assert (sanitize ("\xE4\xBA") == "??"); // 3-byte sequence. + assert (sanitize ("\xD0\xD0") == "??"); // 2-byte sequence. + + // Incomplete recovery. + // + assert (sanitize ("\xD0\xFE") == "??"); // 2-byte sequence. + assert (sanitize ("\xD0\xFE\xFE") == "???"); // 2-byte sequence. + + assert (sanitize ("\xF4\x90\x80\x80") == "????"); // Above the range. + assert (sanitize ("\xED\xA0\x80") == "???"); // Min UTF-16 surrogate. + assert (sanitize ("\xED\xBF\xBF") == "???"); // Max UTF-16 surrogate. + + // Invalid codepoints. + // + auto sanitize_g = [&sanitize] (string s) + { + return sanitize (move (s), codepoint_types::graphic); + }; + + assert (sanitize_g ("\xEF\xB7\x90") == "?"); + assert (sanitize_g ("y\xEF\xB7\x90") == "y?"); + assert (sanitize_g ("\xEF\xB7\x90y") == "?y"); + + // Invalid during recovery. + // + assert (sanitize_g ("\xD0\n") == "??"); + assert (sanitize_g ("\xD0\ny") == "??y"); + assert (sanitize_g ("\xD0\xFE\n") == "???"); + + assert (sanitize_g ("\xD0\xEF\xB7\x90") == "??"); + + // utf8_validator::codepoint() tests. + // + { + u32string r; + size_t invalid_codepoints (0); + + string s ("a" + "\xD0\xB0" + "\n" // Control. + "\xE4\xBA\x8C" + "\xEE\x80\x80" // Private-use. + "\xF0\x90\x8C\x82"); + + utf8_validator val (codepoint_types::graphic); + + for (char c: s) + { + pair v (val.validate (c)); + + if (v.first) + { + if (v.second) + r.push_back (val.codepoint ()); + } + else + ++invalid_codepoints; + } + + assert (r == U"a\x430\x4E8C\x10302"); + assert (invalid_codepoints == 2); + } } -- cgit v1.1