1 files changed, 305 insertions, 0 deletions
diff --git a/libbutl/utf8.ixx b/libbutl/utf8.ixx
new file mode 100644
index 0000000..3d2e092
--- /dev/null
+++ b/libbutl/utf8.ixx
@@ -0,0 +1,305 @@
+// file      : libbutl/utf8.ixx -*- C++ -*-
+// license   : MIT; see accompanying LICENSE file
+
+namespace butl
+{
+  // Save the accepted codepoint types and the whitelist (pointer only).
+  inline utf8_validator::
+  utf8_validator (codepoint_types ts, const char32_t* wl)
+      : types_ (ts), whitelist_ (wl)
+  {
+  }
+
+  inline std::pair<bool, bool> utf8_validator::
+  validate (char c)
+  {
+    return validate (c, static_cast<std::string*> (nullptr)); // No `what`.
+  }
+
+  inline std::pair<bool, bool> utf8_validator::
+  validate (char c, std::string& what)
+  {
+    return this->validate (c, &what); // Error description goes to `what`.
+  }
+
+  // Validate the next byte of a UTF-8 byte stream, keeping the decoding
+  // state (sequence size/index, partial codepoint) between the calls.
+  //
+  // Return (true, false) for a valid byte that doesn't end the sequence,
+  // (true, true) or (false, true) for the last byte of a sequence decoded
+  // into an acceptable or unacceptable codepoint, respectively, and
+  // (false, false) for an invalid byte, in which case the sequence state is
+  // not reset (see recover()). On failure, if what is not NULL, set *what to
+  // the problem description.
+  //
+  inline std::pair<bool, bool> utf8_validator::
+  validate (char c, std::string* what)
+  {
+    using namespace std;
+
+    // A UCS-4 character is encoded as the UTF-8 byte sequence as follows,
+    // depending on the value range it falls into:
+    //
+    // 0x00000000 - 0x0000007F: 0xxxxxxx
+    // 0x00000080 - 0x000007FF: 110xxxxx 10xxxxxx
+    // 0x00000800 - 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
+    // 0x00010000 - 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+    // 0x00200000 - 0x03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+    // 0x04000000 - 0x7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+    //                          10xxxxxx
+    //
+    // Also note that the Unicode Standard (as of 12.1) specifies no
+    // codepoints above 0x10FFFF, so we consider 5- and 6-byte length UTF-8
+    // sequences as invalid (we could have added `unspecified` codepoint type
+    // except that there are no UTF-8 validation tables defined for these
+    // sequences).
+    //
+    unsigned char b (c);
+
+    // Compose the detailed "invalid UTF-8 sequence byte" error.
+    //
+    auto byte_error = [c, b, this] ()
+    {
+      string s ("invalid UTF-8 sequence ");
+
+      const char* names[] = {"first", "second", "third", "fourth"};
+      s += names[seq_index_]; // Position of the offending byte.
+      s += " byte (0x";
+
+      const char digits[] = "0123456789ABCDEF";
+      s += digits[(b >> 4) & 0xF];
+      s += digits[b & 0xF];
+
+      // If the byte happens to be a printable ASCII character then let's
+      // print it as a character as well. This can help a bit with grepping
+      // through text while troubleshooting.
+      //
+      if (b >= 0x20 && b <= 0x7E)
+      {
+        s += " '";
+        s += c;
+        s += "'";
+      }
+
+      s += ")";
+      return s;
+    };
+
+    // Detect the byte sequence length based on its first byte. While at it,
+    // start calculating the resulting Unicode codepoint value.
+    //
+    if (seq_index_ == 0)
+    {
+      if (b < 0x80)
+      {
+        seq_size_ = 1;
+        codepoint_ = b;
+      }
+      else if (b < 0xE0) // Includes 0x80-0xC1, rejected by the table below.
+      {
+        seq_size_ = 2;
+        codepoint_ = b & 0x1F; // Takes 5 rightmost bits.
+      }
+      else if (b < 0xF0)
+      {
+        seq_size_ = 3;
+        codepoint_ = b & 0xF; // Takes 4 rightmost bits.
+      }
+      else if (b < 0xF8)
+      {
+        seq_size_ = 4;
+        codepoint_ = b & 0x7; // Takes 3 rightmost bits.
+      }
+      else
+      {
+        if (what != nullptr)
+        {
+          if (b < 0xFE) // 111110xx (5-byte) or 1111110x (6-byte) leader.
+          {
+            *what  = b < 0xFC ? "5" : "6";
+            *what += "-byte length UTF-8 sequence";
+          }
+          else
+            *what = byte_error ();
+        }
+
+        return make_pair (false, false); // Invalid byte.
+      }
+    }
+
+    // Note that while a codepoint may potentially be encoded with byte
+    // sequences of different lengths, only the shortest encoding sequence is
+    // considered well-formed. Also a well-formed sequence may not be decoded
+    // into invalid codepoint value (see codepoint_type() for details). We
+    // will check all that using the Well-Formed UTF-8 Byte Sequences table
+    // (provided by the Unicode 12.0 Standard) which also takes care of the
+    // missing UTF-8 sequence bytes.
+    //
+    bool valid (false);
+
+    // Return true if a byte value belongs to the specified range.
+    //
+    auto belongs = [] (unsigned char c, unsigned char l, unsigned char r)
+    {
+      return c >= l && c <= r;
+    };
+
+    switch (seq_size_)
+    {
+    case 1: valid = true; break; // Well-formed by the definition (see above).
+    case 2:
+      {
+        // [000080 0007FF]: [C2 DF]  [80 BF]
+        //
+        // Check the first byte and set the second byte range.
+        //
+        if (seq_index_ == 0)
+        {
+          if ((valid = belongs (b, 0xC2, 0xDF)))
+            byte2_range_ = make_pair (0x80, 0xBF);
+        }
+        else // Check the second byte.
+          valid = belongs (b, byte2_range_.first, byte2_range_.second);
+
+        break;
+      }
+    case 3:
+      {
+        // [000800 000FFF]: E0       [A0 BF]  [80 BF]
+        // [001000 00CFFF]: [E1 EC]  [80 BF]  [80 BF]
+        // [00D000 00D7FF]: ED       [80 9F]  [80 BF] ; Excludes surrogates.
+        // [00E000 00FFFF]: [EE EF]  [80 BF]  [80 BF]
+        //
+        // Check the first byte and set the second byte range.
+        //
+        if (seq_index_ == 0)
+        {
+          if ((valid = (b == 0xE0)))
+            byte2_range_ = make_pair (0xA0, 0xBF);
+          else if ((valid = belongs (b, 0xE1, 0xEC)))
+            byte2_range_ = make_pair (0x80, 0xBF);
+          else if ((valid = (b == 0xED)))
+            byte2_range_ = make_pair (0x80, 0x9F);
+          else if ((valid = belongs (b, 0xEE, 0xEF)))
+            byte2_range_ = make_pair (0x80, 0xBF);
+        }
+        else if (seq_index_ == 1) // Check the second byte.
+          valid = belongs (b, byte2_range_.first, byte2_range_.second);
+        else                      // Check the third byte.
+          valid = belongs (b, 0x80, 0xBF);
+
+        break;
+      }
+    case 4:
+      {
+        // [010000 03FFFF]: F0       [90 BF]  [80 BF]  [80 BF]
+        // [040000 0FFFFF]: [F1 F3]  [80 BF]  [80 BF]  [80 BF]
+        // [100000 10FFFF]: F4       [80 8F]  [80 BF]  [80 BF]
+        //
+        // Check the first byte and set the second byte range.
+        //
+        if (seq_index_ == 0)
+        {
+          if ((valid = (b == 0xF0)))
+            byte2_range_ = make_pair (0x90, 0xBF);
+          else if ((valid = belongs (b, 0xF1, 0xF3)))
+            byte2_range_ = make_pair (0x80, 0xBF);
+          else if ((valid = (b == 0xF4)))
+            byte2_range_ = make_pair (0x80, 0x8F);
+        }
+        else if (seq_index_ == 1) // Check the second byte.
+          valid = belongs (b, byte2_range_.first, byte2_range_.second);
+        else                      // Check the third and fourth bytes.
+          valid = belongs (b, 0x80, 0xBF);
+
+        break;
+      }
+    }
+
+    // Bail out if the current UTF-8 sequence byte is invalid.
+    //
+    if (!valid)
+    {
+      // We could probably distinguish "surrogate" and "exceed max value" from
+      // other ill-formedness cases (amend the well-formedness table, keep
+      // decoding the sequence, and test the codepoint in the end) and produce
+      // more specific error messages, but this doesn't seem worth the
+      // trouble.
+      //
+      if (what != nullptr)
+        *what = byte_error ();
+
+      return make_pair (false, false); // Invalid byte.
+    }
+
+    // "Append" the sequence byte's 6 rightmost bits to the resulting
+    // codepoint value, unless this is the first byte (which value is already
+    // taken into account; see above).
+    //
+    if (seq_index_ != 0)
+      codepoint_ = (codepoint_ << 6) | (b & 0x3F);
+
+    // If we didn't get to the end of the UTF-8 sequence, then we are done
+    // with this byte.
+    //
+    if (++seq_index_ != seq_size_)
+      return make_pair (true, false); // Valid byte.
+
+    // Prepare for the next UTF-8 sequence validation, regardless of the
+    // decoded codepoint validity.
+    //
+    seq_index_ = 0;
+
+    // Check the decoded codepoint, unless any codepoint type is allowed.
+    //
+    // Note that the well-formedness sequence check guarantees that we decoded
+    // a valid Unicode codepoint (see above).
+    //
+    if (types_ == codepoint_types::any)
+      return make_pair (true, true); // Valid codepoint.
+
+    // Check if the decoded codepoint is whitelisted. Note that the whitelist
+    // is scanned as a NUL-terminated char32_t string.
+    using traits = u32string::traits_type;
+
+    if (whitelist_ != nullptr &&
+        traits::find (whitelist_, traits::length (whitelist_), codepoint_) !=
+        nullptr)
+      return make_pair (true, true); // Valid codepoint.
+
+    // Now check if the codepoint type matches the specified set. Note: also
+    // covers the `types_ == codepoint_types::none` case.
+    //
+    codepoint_types t (codepoint_type (codepoint_));
+
+    if ((t & types_) != codepoint_types::none)
+      return make_pair (true, true); // Valid codepoint.
+
+    if (what != nullptr)
+      *what = "invalid Unicode codepoint (" + to_string (t) + ")";
+
+    return make_pair (false, true); // Invalid codepoint.
+  }
+
+  inline std::pair<bool, bool> utf8_validator::
+  recover (char c)
+  {
+    // A sequence trailing byte (10xxxxxx) can never be a leading byte, so
+    // as an optimization reject it without consulting validate().
+    //
+    const bool trailing ((c & 0xC0) == 0x80);
+
+    if (!trailing)
+    {
+      seq_index_ = 0; // Forget the broken sequence and start afresh.
+      return validate (c); // Recovered if this is a valid leading byte.
+    }
+    return std::make_pair (false, false); // Invalid byte.
+  }
+
+  inline char32_t utf8_validator::
+  codepoint () const
+  {
+    return this->codepoint_; // Value accumulated by validate().
+  }
+}