From 5ae9686adac1508873f2d980e84becd3496244c2 Mon Sep 17 00:00:00 2001
From: Karen Arutyunov <karen@codesynthesis.com>
Date: Wed, 26 Feb 2020 17:16:45 +0300
Subject: Add notion of validator to char_scanner and make sure manifest is
 UTF-8

This involves implementing utf8_validator and UTF-8 utility functions and
using them during the manifest parsing, serialization, and rewriting.
---
 tests/utf8/driver.cxx | 207 +++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 196 insertions(+), 11 deletions(-)

(limited to 'tests/utf8')

diff --git a/tests/utf8/driver.cxx b/tests/utf8/driver.cxx
index 8480dec..f35e65e 100644
--- a/tests/utf8/driver.cxx
+++ b/tests/utf8/driver.cxx
@@ -13,8 +13,10 @@
 #ifdef __cpp_lib_modules_ts
 import std.core;
 #endif
+import butl.utf8;
 import butl.utility;
 #else
+#include <libbutl/utf8.mxx>
 #include <libbutl/utility.mxx>
 #endif
 
@@ -24,6 +26,17 @@ using namespace butl;
 int
 main ()
 {
+  // utf8() tests.
+  //
+  auto utf8_error = [] (const string& s,
+                        codepoint_types ts = codepoint_types::any,
+                        const char32_t* wl = nullptr)
+  {
+    string error;
+    assert (!utf8 (s, error, ts, wl));
+    return error;
+  };
+
   // Valid sequences.
   //
   // Empty.
@@ -43,18 +56,36 @@ main ()
 
   // Ill-formed sequences.
   //
+  // Long sequences.
+  //
+  assert (!utf8 ("\xF8")); // 5-byte sequence.
+  assert (!utf8 ("\xFC")); // 6-byte sequence.
+
+  assert (utf8_error ("\xF8") == "5-byte length UTF-8 sequence");
+  assert (utf8_error ("\xFC") == "6-byte length UTF-8 sequence");
+  assert (utf8_error ("\xFE") == "invalid UTF-8 sequence first byte (0xFE)");
+
   // 2-byte sequences.
   //
   assert (!utf8 ("\xC1\x80")); // Invalid first byte.
   assert (!utf8 ("\xD0y"));    // Invalid second byte.
 
+  assert (utf8_error ("\xC1\x80") ==
+          "invalid UTF-8 sequence first byte (0xC1)");
+
+  assert (utf8_error ("\xD0y") ==
+          "invalid UTF-8 sequence second byte (0x79 'y')");
+
   // 3-byte sequences.
   //
   assert (!utf8 ("\xE2\x70\x80")); // Invalid second byte.
   assert (!utf8 ("\xE2\x80\x70")); // Invalid third byte.
 
-  assert (!utf8 ("\xED\xA0\x80")); // Min UTF-16 surrogate value.
-  assert (!utf8 ("\xED\xBF\xBF")); // Max UTF-16 surrogate value.
+  assert (!utf8 ("\xED\xA0\x80")); // Min UTF-16 surrogate.
+  assert (!utf8 ("\xED\xBF\xBF")); // Max UTF-16 surrogate.
+
+  assert (utf8_error ("\xE2\x80\x70") ==
+          "invalid UTF-8 sequence third byte (0x70 'p')");
 
   // 4-byte sequences.
   //
@@ -63,9 +94,8 @@ main ()
   assert (!utf8 ("\xF0\x90\x70\x80")); // Invalid third byte.
   assert (!utf8 ("\xF1\x80\x80\xC0")); // Invalid forth byte.
 
-  // Out of the codepoint range (0x10ffff + 1).
-  //
-  assert (!utf8 ("\xF4\x90\x80\x80"));
+  assert (utf8_error ("\xF1\x80\x80\xC0") ==
+          "invalid UTF-8 sequence forth byte (0xC0)");
 
   // Incomplete sequences.
   //
@@ -73,14 +103,25 @@ main ()
   assert (!utf8 ("\xE4\xBA"));     // 3-byte sequence.
   assert (!utf8 ("\xF0\x90\x8C")); // 4-byte sequence.
 
+  assert (utf8_error ("\xD0") == "incomplete UTF-8 sequence");
+
   // Missing sequence leading bytes.
   //
-  assert (!utf8 ("\xB0xyz"));         // 2-byte sequence.
-  assert (!utf8 ("\xBA\x8Cxyz"));     // 3-byte sequence.
-  assert (!utf8 ("\x8Cxyz"));         // 3-byte sequence.
-  assert (!utf8 ("\x90\x8C\x82xyz")); // 4-byte sequence.
-  assert (!utf8 ("\x8C\x82xyz"));     // 4-byte sequence.
-  assert (!utf8 ("\x82xyz"));         // 4-byte sequence.
+  assert (!utf8 ("\xB0xyz"));            // 2-byte sequence.
+  assert (!utf8 ("\xBA\x8C\xD0\xB0yz")); // 3-byte sequence.
+  assert (!utf8 ("\x8Cxyz"));            // 3-byte sequence.
+  assert (!utf8 ("\x90\x8C\x82xyz"));    // 4-byte sequence.
+  assert (!utf8 ("\x8C\x82xyz"));        // 4-byte sequence.
+  assert (!utf8 ("\x82xyz"));            // 4-byte sequence.
+
+  assert (utf8_error ("\xB0") == "invalid UTF-8 sequence first byte (0xB0)");
+
+  // Above the valid codepoint range (0x10ffff + 1).
+  //
+  assert (!utf8 ("\xF4\x90\x80\x80"));
+
+  assert (utf8_error ("\xF4\x90\x80\x80") ==
+          "invalid UTF-8 sequence second byte (0x90)");
 
   // Whitelisting.
   //
@@ -145,6 +186,9 @@ main ()
   assert (!utf8 ("\xF3\xA1\x80\x80", codepoint_types::graphic)); // Reserved.
   assert (!utf8 ("\xF3\xA0\x81\xBF", codepoint_types::graphic)); // Format.
 
+  assert (utf8_error ("\xF3\xA0\x81\xBF", codepoint_types::graphic) ==
+          "invalid Unicode codepoint (format)");
+
   assert (!utf8 ("\xC2\xAC", codepoint_types::format)); // Graphic.
 
   // Private-use & Graphic.
@@ -153,4 +197,145 @@ main ()
                  codepoint_types::format));
 
   assert (!utf8 ("a", codepoint_types::none)); // None.
+
+  assert (utf8_error ("a", codepoint_types::none) ==
+          "invalid Unicode codepoint (graphic)");
+
+  // UTF-8 string length.
+  //
+  auto invalid_utf8 = [] (string s, codepoint_types ts = codepoint_types::any)
+  {
+    try
+    {
+      utf8_length (s, ts);
+      return false;
+    }
+    catch (const invalid_argument&)
+    {
+      return true;
+    }
+  };
+
+  assert (utf8_length ("") == 0);
+  assert (utf8_length ("x\xD0\xB0\xE4\xBA\x8C\xF0\x90\x8C\x82y") == 5);
+
+  assert (invalid_utf8 ("\xFE"));                         // Invalid byte.
+  assert (invalid_utf8 ("\xD0"));                         // Incomplete.
+  assert (invalid_utf8 ("\n", codepoint_types::graphic)); // Invalid codepoint.
+
+  // to_utf8() tests.
+  //
+  auto roundtrip = [] (const char* s)
+  {
+    string r (s);
+    to_utf8 (r, '?');
+    return r == s;
+  };
+
+  auto sanitize = [] (string s, codepoint_types ts = codepoint_types::any)
+  {
+    to_utf8 (s, '?', ts);
+    return s;
+  };
+
+  // Empty.
+  //
+  assert (roundtrip (""));
+
+  // 1 code point.
+  //
+  assert (roundtrip ("a"));                // 1 byte.
+  assert (roundtrip ("\xD0\xB0"));         // 2 bytes.
+  assert (roundtrip ("\xE4\xBA\x8C"));     // 3 bytes.
+  assert (roundtrip ("\xF0\x90\x8C\x82")); // 4 bytes.
+
+  // Multiple code points.
+  //
+  assert (roundtrip ("a\xD0\xB0\xE4\xBA\x8C\xF0\x90\x8C\x82"));
+
+  // Ill-formed sequences.
+  //
+  // Long sequence.
+  //
+  assert (sanitize ("\xF8") == "?"); // 5-byte sequence.
+
+  // Invalid first byte followed by a second byte which ...
+  //
+  assert (sanitize ("\xC1\x80")     == "??");        // is a trailing byte.
+  assert (sanitize ("\xC1y")        == "?y");        // starts 1-byte sequence.
+  assert (sanitize ("\xC1\xD0\xB0") == "?\xD0\xB0"); // starts 2-byte sequence.
+  assert (sanitize ("\xC1\xFE")     == "??");        // is not UTF-8.
+
+  // Invalid second byte which ...
+  //
+  assert (sanitize ("\xD0y")        == "?y");        // starts 1-byte sequence.
+  assert (sanitize ("\xD0\xD0\xB0") == "?\xD0\xB0"); // starts 2-byte sequence.
+  assert (sanitize ("\xD0\xFE")     == "??");        // is not UTF-8.
+
+  // Incomplete sequences.
+  //
+  assert (sanitize ("\xD0")     == "?");   // 2-byte sequence.
+  assert (sanitize ("y\xD0")    == "y?");  // 2-byte sequence.
+  assert (sanitize ("\xE4\xBA") == "??");  // 3-byte sequence.
+  assert (sanitize ("\xD0\xD0") == "??");  // 2-byte sequence.
+
+  // Incomplete recovery.
+  //
+  assert (sanitize ("\xD0\xFE")     == "??");  // 2-byte sequence.
+  assert (sanitize ("\xD0\xFE\xFE") == "???"); // 2-byte sequence.
+
+  assert (sanitize ("\xF4\x90\x80\x80") == "????"); // Above the range.
+  assert (sanitize ("\xED\xA0\x80")     == "???");  // Min UTF-16 surrogate.
+  assert (sanitize ("\xED\xBF\xBF")     == "???");  // Max UTF-16 surrogate.
+
+  // Invalid codepoints.
+  //
+  auto sanitize_g = [&sanitize] (string s)
+  {
+    return sanitize (move (s), codepoint_types::graphic);
+  };
+
+  assert (sanitize_g ("\xEF\xB7\x90")  == "?");
+  assert (sanitize_g ("y\xEF\xB7\x90") == "y?");
+  assert (sanitize_g ("\xEF\xB7\x90y") == "?y");
+
+  // Invalid during recovery.
+  //
+  assert (sanitize_g ("\xD0\n")     == "??");
+  assert (sanitize_g ("\xD0\ny")    == "??y");
+  assert (sanitize_g ("\xD0\xFE\n") == "???");
+
+  assert (sanitize_g ("\xD0\xEF\xB7\x90") == "??");
+
+  // utf8_validator::codepoint() tests.
+  //
+  {
+    u32string r;
+    size_t invalid_codepoints (0);
+
+    string s ("a"
+              "\xD0\xB0"
+              "\n"                 // Control.
+              "\xE4\xBA\x8C"
+              "\xEE\x80\x80"       // Private-use.
+              "\xF0\x90\x8C\x82");
+
+    utf8_validator val (codepoint_types::graphic);
+
+    for (char c: s)
+    {
+      pair<bool, bool> v (val.validate (c));
+
+      if (v.first)
+      {
+        if (v.second)
+          r.push_back (val.codepoint ());
+      }
+      else
+        ++invalid_codepoints;
+    }
+
+    assert (r == U"a\x430\x4E8C\x10302");
+    assert (invalid_codepoints == 2);
+  }
 }
-- 
cgit v1.1