about summary refs log tree commit diff
path: root/tests
diff options
context:
space:
mode:
author Karen Arutyunov <karen@codesynthesis.com> 2020-02-26 17:16:45 +0300
committer Karen Arutyunov <karen@codesynthesis.com> 2020-02-26 17:17:49 +0300
commit 5ae9686adac1508873f2d980e84becd3496244c2 (patch)
tree d7c88e678b29ed6bb7ae30b74bd01aa2b5d2e9a8 /tests
parent afb726d2d59b3715960a8647738860f40e37cf4f (diff)
Add notion of validator to char_scanner and make sure manifest is UTF-8
This involves implementing utf8_validator and UTF-8 utility functions and using them during the manifest parsing, serialization, and rewriting.
Diffstat (limited to 'tests')
-rw-r--r-- tests/manifest-parser/driver.cxx 40
-rw-r--r-- tests/manifest-rewriter/driver.cxx 4
-rw-r--r-- tests/manifest-serializer/driver.cxx 18
-rw-r--r-- tests/utf8/driver.cxx 207
4 files changed, 257 insertions, 12 deletions
diff --git a/tests/manifest-parser/driver.cxx b/tests/manifest-parser/driver.cxx
index 57674cb..a34f2b7 100644
--- a/tests/manifest-parser/driver.cxx
+++ b/tests/manifest-parser/driver.cxx
@@ -40,6 +40,9 @@ namespace butl
static bool
equal (const optional<pairs>& actual, const optional<pairs>& expected);
+ static pairs
+ parse (const char* m, manifest_parser::filter_function f = {});
+
// Test manifest as it is represented in the stream, including format
// version and end-of-manifest values.
//
@@ -188,6 +191,41 @@ namespace butl
assert (p.first == "" && p.second == "comment");
}
+ // UTF-8.
+ //
+ assert (test (":1\n#\xD0\xB0\n\xD0\xB0y\xD0\xB0:\xD0\xB0z\xD0\xB0",
+ {{"","1"},
+ {"\xD0\xB0y\xD0\xB0", "\xD0\xB0z\xD0\xB0"},
+ {"",""},
+ {"",""}}));
+
+ assert (fail (":1\n#\xD0\n\xD0\xB0y\xD0\xB0:\xD0\xB0z\xD0\xB0"));
+ assert (fail (":1\n#\xD0\xB0\n\xB0y\xD0\xB0:\xD0\xB0z\xD0\xB0"));
+ assert (fail (":1\n#\xD0\xB0\n\xD0y\xD0\xB0:\xD0\xB0z\xD0\xB0"));
+ assert (fail (":1\n#\xD0\xB0\n\xD0\xB0y\xD0:\xD0\xB0z\xD0\xB0"));
+ assert (fail (":1\n#\xD0\xB0\n\xD0\xB0y\xD0\xB0:\xD0z\xD0\xB0"));
+ assert (fail (":1\n#\xD0\xB0\n\xD0\xB0y\xD0\xB0:\xD0\xB0z\xD0"));
+ assert (fail (":1\r\r\xB0#\n\xD0\xB0y\xD0\xB0:\xD0\xB0z\xD0\xB0"));
+ assert (fail (":1\r\xD0#\n\xD0\xB0y\xD0\xB0:\xD0\xB0z\xD0\xB0"));
+ assert (fail (":1\n#\xD0\xB0\n\xD0\xB0y\xD0\xB0:\xD0\xB0z\xD0\xB0\r\xD0"));
+
+ // Test parsing failure for manifest with multi-byte UTF-8 sequences
+ // (the column is properly reported, etc).
+ //
+ try
+ {
+ parse (":1\na\xD0\xB0\xD0\xB0\xFE");
+ assert (false);
+ }
+ catch (const manifest_parsing& e)
+ {
+ assert (e.line == 2 &&
+ e.column == 4 &&
+ e.description ==
+ "invalid manifest name: "
+ "invalid UTF-8 sequence first byte (0xFE)");
+ }
+
// Filtering.
//
assert (test (":1\na: abc\nb: bca\nc: cab",
@@ -281,7 +319,7 @@ namespace butl
}
static pairs
- parse (const char* m, manifest_parser::filter_function f = {})
+ parse (const char* m, manifest_parser::filter_function f)
{
istringstream is (m);
is.exceptions (istream::failbit | istream::badbit);
diff --git a/tests/manifest-rewriter/driver.cxx b/tests/manifest-rewriter/driver.cxx
index fd76929..ec73d81 100644
--- a/tests/manifest-rewriter/driver.cxx
+++ b/tests/manifest-rewriter/driver.cxx
@@ -90,6 +90,10 @@ namespace butl
{{"abc", "xyz"}}) ==
":1\n abc: \\\nxyz\n\\");
+ assert (edit (":1\n a\xD0\xB0g : b",
+ {{"a\xD0\xB0g", "xyz"}}) ==
+ ":1\n a\xD0\xB0g : \\\nxyz\n\\");
+
// Test editing of manifests that contains CR characters.
//
assert (edit (":1\r\na: b\r\r\n", {{"a", "xyz"}}) == ":1\r\na: xyz\r\r\n");
diff --git a/tests/manifest-serializer/driver.cxx b/tests/manifest-serializer/driver.cxx
index 148a281..c818b4a 100644
--- a/tests/manifest-serializer/driver.cxx
+++ b/tests/manifest-serializer/driver.cxx
@@ -46,6 +46,7 @@ main ()
assert (test ({{"#", "x"}}, "# x\n"));
assert (test ({{"#", "x"},{"#", "y"},{"#", ""}}, "# x\n# y\n#\n"));
assert (fail ({{"",""},{"#", "x"}})); // serialization after eos
+ assert (fail ({{"#", "\xB0"}})); // invalid UTF-8 sequence
// Empty manifest stream.
//
@@ -89,6 +90,12 @@ main ()
assert (fail ({{"","1"},{"a b",""}}));
assert (fail ({{"","1"},{"a\tb",""}}));
assert (fail ({{"","1"},{"a\n",""}}));
+ assert (fail ({{"","1"},{"a\xB0",""}})); // invalid UTF-8 sequence
+
+ // Invalid value.
+ //
+ assert (fail ({{"","1"},{"a","\xB0"}})); // invalid UTF-8 sequence
+ assert (fail ({{"","1"},{"a","\xD0"}})); // incomplete UTF-8 sequence
// Simple value.
//
@@ -172,11 +179,22 @@ main ()
"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\\Y\\\n"
"yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy");
+ // Hard break after the UTF-8/delayed hard break.
+ //
+ string l6 ("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\xF0\x90\x8C\x82"
+ "\xF0\x90\x8C\x82yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy");
+
+ string e6 ("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
+ "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\xF0\x90\x8C\x82\\\n"
+ "\xF0\x90\x8C\x82yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy");
+
assert (test ({{"","1"},{"a",l1},{"",""},{"",""}}, ": 1\na: " + e1 + "\n"));
assert (test ({{"","1"},{"a",l2},{"",""},{"",""}}, ": 1\na: " + e2 + "\n"));
assert (test ({{"","1"},{"a",l3},{"",""},{"",""}}, ": 1\na: " + e3 + "\n"));
assert (test ({{"","1"},{"a",l4},{"",""},{"",""}}, ": 1\na: " + e4 + "\n"));
assert (test ({{"","1"},{"a",l5},{"",""},{"",""}}, ": 1\na: " + e5 + "\n"));
+ assert (test ({{"","1"},{"a",l6},{"",""},{"",""}}, ": 1\na: " + e6 + "\n"));
// Multi-line value.
//
diff --git a/tests/utf8/driver.cxx b/tests/utf8/driver.cxx
index 8480dec..f35e65e 100644
--- a/tests/utf8/driver.cxx
+++ b/tests/utf8/driver.cxx
@@ -13,8 +13,10 @@
#ifdef __cpp_lib_modules_ts
import std.core;
#endif
+import butl.utf8;
import butl.utility;
#else
+#include <libbutl/utf8.mxx>
#include <libbutl/utility.mxx>
#endif
@@ -24,6 +26,17 @@ using namespace butl;
int
main ()
{
+ // utf8() tests.
+ //
+ auto utf8_error = [] (const string& s,
+ codepoint_types ts = codepoint_types::any,
+ const char32_t* wl = nullptr)
+ {
+ string error;
+ assert (!utf8 (s, error, ts, wl));
+ return error;
+ };
+
// Valid sequences.
//
// Empty.
@@ -43,18 +56,36 @@ main ()
// Ill-formed sequences.
//
+ // Long sequences.
+ //
+ assert (!utf8 ("\xF8")); // 5-byte sequence.
+ assert (!utf8 ("\xFC")); // 6-byte sequence.
+
+ assert (utf8_error ("\xF8") == "5-byte length UTF-8 sequence");
+ assert (utf8_error ("\xFC") == "6-byte length UTF-8 sequence");
+ assert (utf8_error ("\xFE") == "invalid UTF-8 sequence first byte (0xFE)");
+
// 2-byte sequences.
//
assert (!utf8 ("\xC1\x80")); // Invalid first byte.
assert (!utf8 ("\xD0y")); // Invalid second byte.
+ assert (utf8_error ("\xC1\x80") ==
+ "invalid UTF-8 sequence first byte (0xC1)");
+
+ assert (utf8_error ("\xD0y") ==
+ "invalid UTF-8 sequence second byte (0x79 'y')");
+
// 3-byte sequences.
//
assert (!utf8 ("\xE2\x70\x80")); // Invalid second byte.
assert (!utf8 ("\xE2\x80\x70")); // Invalid third byte.
- assert (!utf8 ("\xED\xA0\x80")); // Min UTF-16 surrogate value.
- assert (!utf8 ("\xED\xBF\xBF")); // Max UTF-16 surrogate value.
+ assert (!utf8 ("\xED\xA0\x80")); // Min UTF-16 surrogate.
+ assert (!utf8 ("\xED\xBF\xBF")); // Max UTF-16 surrogate.
+
+ assert (utf8_error ("\xE2\x80\x70") ==
+ "invalid UTF-8 sequence third byte (0x70 'p')");
// 4-byte sequences.
//
@@ -63,9 +94,8 @@ main ()
assert (!utf8 ("\xF0\x90\x70\x80")); // Invalid third byte.
assert (!utf8 ("\xF1\x80\x80\xC0")); // Invalid forth byte.
- // Out of the codepoint range (0x10ffff + 1).
- //
- assert (!utf8 ("\xF4\x90\x80\x80"));
+ assert (utf8_error ("\xF1\x80\x80\xC0") ==
+ "invalid UTF-8 sequence forth byte (0xC0)");
// Incomplete sequences.
//
@@ -73,14 +103,25 @@ main ()
assert (!utf8 ("\xE4\xBA")); // 3-byte sequence.
assert (!utf8 ("\xF0\x90\x8C")); // 4-byte sequence.
+ assert (utf8_error ("\xD0") == "incomplete UTF-8 sequence");
+
// Missing sequence leading bytes.
//
- assert (!utf8 ("\xB0xyz")); // 2-byte sequence.
- assert (!utf8 ("\xBA\x8Cxyz")); // 3-byte sequence.
- assert (!utf8 ("\x8Cxyz")); // 3-byte sequence.
- assert (!utf8 ("\x90\x8C\x82xyz")); // 4-byte sequence.
- assert (!utf8 ("\x8C\x82xyz")); // 4-byte sequence.
- assert (!utf8 ("\x82xyz")); // 4-byte sequence.
+ assert (!utf8 ("\xB0xyz")); // 2-byte sequence.
+ assert (!utf8 ("\xBA\x8C\xD0\xB0yz")); // 3-byte sequence.
+ assert (!utf8 ("\x8Cxyz")); // 3-byte sequence.
+ assert (!utf8 ("\x90\x8C\x82xyz")); // 4-byte sequence.
+ assert (!utf8 ("\x8C\x82xyz")); // 4-byte sequence.
+ assert (!utf8 ("\x82xyz")); // 4-byte sequence.
+
+ assert (utf8_error ("\xB0") == "invalid UTF-8 sequence first byte (0xB0)");
+
+ // Above the valid codepoint range (0x10ffff + 1).
+ //
+ assert (!utf8 ("\xF4\x90\x80\x80"));
+
+ assert (utf8_error ("\xF4\x90\x80\x80") ==
+ "invalid UTF-8 sequence second byte (0x90)");
// Whitelisting.
//
@@ -145,6 +186,9 @@ main ()
assert (!utf8 ("\xF3\xA1\x80\x80", codepoint_types::graphic)); // Reserved.
assert (!utf8 ("\xF3\xA0\x81\xBF", codepoint_types::graphic)); // Format.
+ assert (utf8_error ("\xF3\xA0\x81\xBF", codepoint_types::graphic) ==
+ "invalid Unicode codepoint (format)");
+
assert (!utf8 ("\xC2\xAC", codepoint_types::format)); // Graphic.
// Private-use & Graphic.
@@ -153,4 +197,145 @@ main ()
codepoint_types::format));
assert (!utf8 ("a", codepoint_types::none)); // None.
+
+ assert (utf8_error ("a", codepoint_types::none) ==
+ "invalid Unicode codepoint (graphic)");
+
+ // UTF-8 string length.
+ //
+ auto invalid_utf8 = [] (string s, codepoint_types ts = codepoint_types::any)
+ {
+ try
+ {
+ utf8_length (s, ts);
+ return false;
+ }
+ catch (const invalid_argument&)
+ {
+ return true;
+ }
+ };
+
+ assert (utf8_length ("") == 0);
+ assert (utf8_length ("x\xD0\xB0\xE4\xBA\x8C\xF0\x90\x8C\x82y") == 5);
+
+ assert (invalid_utf8 ("\xFE")); // Invalid byte.
+ assert (invalid_utf8 ("\xD0")); // Incomplete.
+ assert (invalid_utf8 ("\n", codepoint_types::graphic)); // Invalid codepoint.
+
+ // to_utf8() tests.
+ //
+ auto roundtrip = [] (const char* s)
+ {
+ string r (s);
+ to_utf8 (r, '?');
+ return r == s;
+ };
+
+ auto sanitize = [] (string s, codepoint_types ts = codepoint_types::any)
+ {
+ to_utf8 (s, '?', ts);
+ return s;
+ };
+
+ // Empty.
+ //
+ assert (roundtrip (""));
+
+ // 1 code point.
+ //
+ assert (roundtrip ("a")); // 1 byte.
+ assert (roundtrip ("\xD0\xB0")); // 2 bytes.
+ assert (roundtrip ("\xE4\xBA\x8C")); // 3 bytes.
+ assert (roundtrip ("\xF0\x90\x8C\x82")); // 4 bytes.
+
+ // Multiple code points.
+ //
+ assert (roundtrip ("a\xD0\xB0\xE4\xBA\x8C\xF0\x90\x8C\x82"));
+
+ // Ill-formed sequences.
+ //
+ // Long sequence.
+ //
+ assert (sanitize ("\xF8") == "?"); // 5-byte sequence.
+
+ // Invalid first byte followed by a second byte which ...
+ //
+ assert (sanitize ("\xC1\x80") == "??"); // is a trailing byte.
+ assert (sanitize ("\xC1y") == "?y"); // starts 1-byte sequence.
+ assert (sanitize ("\xC1\xD0\xB0") == "?\xD0\xB0"); // starts 2-byte sequence.
+ assert (sanitize ("\xC1\xFE") == "??"); // is not UTF-8.
+
+ // Invalid second byte which ...
+ //
+ assert (sanitize ("\xD0y") == "?y"); // starts 1-byte sequence.
+ assert (sanitize ("\xD0\xD0\xB0") == "?\xD0\xB0"); // starts 2-byte sequence.
+ assert (sanitize ("\xD0\xFE") == "??"); // is not UTF-8.
+
+ // Incomplete sequences.
+ //
+ assert (sanitize ("\xD0") == "?"); // 2-byte sequence.
+ assert (sanitize ("y\xD0") == "y?"); // 2-byte sequence.
+ assert (sanitize ("\xE4\xBA") == "??"); // 3-byte sequence.
+ assert (sanitize ("\xD0\xD0") == "??"); // 2-byte sequence.
+
+ // Incomplete recovery.
+ //
+ assert (sanitize ("\xD0\xFE") == "??"); // 2-byte sequence.
+ assert (sanitize ("\xD0\xFE\xFE") == "???"); // 2-byte sequence.
+
+ assert (sanitize ("\xF4\x90\x80\x80") == "????"); // Above the range.
+ assert (sanitize ("\xED\xA0\x80") == "???"); // Min UTF-16 surrogate.
+ assert (sanitize ("\xED\xBF\xBF") == "???"); // Max UTF-16 surrogate.
+
+ // Invalid codepoints.
+ //
+ auto sanitize_g = [&sanitize] (string s)
+ {
+ return sanitize (move (s), codepoint_types::graphic);
+ };
+
+ assert (sanitize_g ("\xEF\xB7\x90") == "?");
+ assert (sanitize_g ("y\xEF\xB7\x90") == "y?");
+ assert (sanitize_g ("\xEF\xB7\x90y") == "?y");
+
+ // Invalid during recovery.
+ //
+ assert (sanitize_g ("\xD0\n") == "??");
+ assert (sanitize_g ("\xD0\ny") == "??y");
+ assert (sanitize_g ("\xD0\xFE\n") == "???");
+
+ assert (sanitize_g ("\xD0\xEF\xB7\x90") == "??");
+
+ // utf8_validator::codepoint() tests.
+ //
+ {
+ u32string r;
+ size_t invalid_codepoints (0);
+
+ string s ("a"
+ "\xD0\xB0"
+ "\n" // Control.
+ "\xE4\xBA\x8C"
+ "\xEE\x80\x80" // Private-use.
+ "\xF0\x90\x8C\x82");
+
+ utf8_validator val (codepoint_types::graphic);
+
+ for (char c: s)
+ {
+ pair<bool, bool> v (val.validate (c));
+
+ if (v.first)
+ {
+ if (v.second)
+ r.push_back (val.codepoint ());
+ }
+ else
+ ++invalid_codepoints;
+ }
+
+ assert (r == U"a\x430\x4E8C\x10302");
+ assert (invalid_codepoints == 2);
+ }
}