aboutsummaryrefslogtreecommitdiff
path: root/libbutl/utf8.ixx
diff options
context:
space:
mode:
Diffstat (limited to 'libbutl/utf8.ixx')
-rw-r--r--libbutl/utf8.ixx305
1 files changed, 305 insertions, 0 deletions
diff --git a/libbutl/utf8.ixx b/libbutl/utf8.ixx
new file mode 100644
index 0000000..3d2e092
--- /dev/null
+++ b/libbutl/utf8.ixx
@@ -0,0 +1,305 @@
+// file : libbutl/utf8.ixx -*- C++ -*-
+// license : MIT; see accompanying LICENSE file
+
+namespace butl
+{
+ // Create a validator that remembers the set of acceptable codepoint types
+ // and the whitelist of additionally accepted codepoints. Both values are
+ // stored as is and are only consulted by validate() once a complete
+ // codepoint is decoded. The whitelist may be NULL.
+ //
+ inline utf8_validator::
+ utf8_validator (codepoint_types ts, const char32_t* wl)
+     : types_ (ts), whitelist_ (wl) {}
+
+ inline std::pair<bool, bool> utf8_validator::
+ validate (char c)
+ {
+ return validate (c, nullptr /* what */);
+ }
+
+ inline std::pair<bool, bool> utf8_validator::
+ validate (char c, std::string& what)
+ {
+ return validate (c, &what);
+ }
+
+ // Validate the next byte of a UTF-8 byte sequence, optionally describing
+ // the failure.
+ //
+ // The returned pair has the following meaning:
+ //
+ // {true,  false} - the byte is valid but the sequence is not yet complete
+ // {true,  true}  - the byte completes the sequence and the decoded
+ //                  codepoint is accepted
+ // {false, true}  - the byte completes the sequence but the decoded
+ //                  codepoint is neither whitelisted nor of an allowed type
+ // {false, false} - the byte is invalid for the current sequence position
+ //
+ // If the validation fails and what is not NULL, then the string it points
+ // to is set to the error description. It is left untouched on success.
+ //
+ // Note that after an invalid byte the position inside the sequence is not
+ // reset (see recover() for that), while after the last sequence byte it is
+ // always reset, regardless of the decoded codepoint validity.
+ //
+ inline std::pair<bool, bool> utf8_validator::
+ validate (char c, std::string* what)
+ {
+   using namespace std;
+
+   // A UCS-4 character is encoded as the UTF-8 byte sequence as follows,
+   // depending on the value range it falls into:
+   //
+   // 0x00000000 - 0x0000007F:
+   //   0xxxxxxx
+   //
+   // 0x00000080 - 0x000007FF:
+   //   110xxxxx 10xxxxxx
+   //
+   // 0x00000800 - 0x0000FFFF:
+   //   1110xxxx 10xxxxxx 10xxxxxx
+   //
+   // 0x00010000 - 0x001FFFFF:
+   //   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+   //
+   // 0x00200000 - 0x03FFFFFF:
+   //   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+   //
+   // 0x04000000 - 0x7FFFFFFF:
+   //   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+   //
+   // Also note that the Unicode Standard (as of 12.1) specifies no
+   // codepoints above 0x10FFFF, so we consider 5- and 6-byte length UTF-8
+   // sequences as invalid (we could have added `unspecified` codepoint type
+   // except that there are no UTF-8 validation tables defined for these
+   // sequences).
+   //
+   unsigned char b (c);
+
+   // Compose the detailed "invalid UTF-8 sequence byte" error.
+   //
+   auto byte_error = [c, b, this] ()
+   {
+     string s ("invalid UTF-8 sequence ");
+
+     // Ordinal of the offending byte inside the sequence. Note that the
+     // sequence index never exceeds 3 since the sequences longer than 4
+     // bytes are rejected on their first byte.
+     //
+     const char* names[] = {"first", "second", "third", "fourth"};
+     s += names[seq_index_];
+     s += " byte (0x";
+
+     const char digits[] = "0123456789ABCDEF";
+     s += digits[(b >> 4) & 0xF];
+     s += digits[b & 0xF];
+
+     // If the byte happens to be a printable ASCII character then let's
+     // print it as a character as well. This can help a bit with grepping
+     // through text while troubleshooting.
+     //
+     if (b >= 0x20 && b <= 0x7E)
+     {
+       s += " '";
+       s += c;
+       s += "'";
+     }
+
+     s += ")";
+     return s;
+   };
+
+   // Detect the byte sequence length based on its first byte. While at it,
+   // start calculating the resulting Unicode codepoint value.
+   //
+   // Note that the first byte values which cannot start a well-formed
+   // sequence of the detected length (for example, a trailing byte) are
+   // rejected later by the well-formedness check.
+   //
+   if (seq_index_ == 0)
+   {
+     if (b < 0x80)
+     {
+       seq_size_ = 1;
+       codepoint_ = b;
+     }
+     else if (b < 0xE0)
+     {
+       seq_size_ = 2;
+       codepoint_ = b & 0x1F; // Takes 5 rightmost bits.
+     }
+     else if (b < 0xF0)
+     {
+       seq_size_ = 3;
+       codepoint_ = b & 0xF; // Takes 4 rightmost bits.
+     }
+     else if (b < 0xF8)
+     {
+       seq_size_ = 4;
+       codepoint_ = b & 0x7; // Takes 3 rightmost bits.
+     }
+     else
+     {
+       if (what != nullptr)
+       {
+         if (b < 0xFE)
+         {
+           *what = b < 0xFC ? "5" : "6";
+           *what += "-byte length UTF-8 sequence";
+         }
+         else
+           *what = byte_error ();
+       }
+
+       return make_pair (false, false); // Invalid byte.
+     }
+   }
+
+   // Note that while a codepoint may potentially be encoded with byte
+   // sequences of different lengths, only the shortest encoding sequence is
+   // considered well-formed. Also a well-formed sequence may not be decoded
+   // into invalid codepoint value (see codepoint_type() for details). We
+   // will check all that using the Well-Formed UTF-8 Byte Sequences table
+   // (provided by the Unicode 12.0 Standard) which also takes care of the
+   // missing UTF-8 sequence bytes.
+   //
+   bool valid (false);
+
+   // Return true if a byte value belongs to the specified range.
+   //
+   auto belongs = [] (unsigned char v, unsigned char l, unsigned char r)
+   {
+     return v >= l && v <= r;
+   };
+
+   switch (seq_size_)
+   {
+   case 1: valid = true; break; // Well-formed by the definition (see above).
+   case 2:
+     {
+       // [000080 0007FF]: [C2 DF] [80 BF]
+       //
+       // Check the first byte and set the second byte range.
+       //
+       if (seq_index_ == 0)
+       {
+         if ((valid = belongs (b, 0xC2, 0xDF)))
+           byte2_range_ = make_pair (0x80, 0xBF);
+       }
+       else // Check the second byte.
+         valid = belongs (b, byte2_range_.first, byte2_range_.second);
+
+       break;
+     }
+   case 3:
+     {
+       // [000800 000FFF]: E0      [A0 BF] [80 BF]
+       // [001000 00CFFF]: [E1 EC] [80 BF] [80 BF]
+       // [00D000 00D7FF]: ED      [80 9F] [80 BF] ; Excludes surrogates.
+       // [00E000 00FFFF]: [EE EF] [80 BF] [80 BF]
+       //
+       // Check the first byte and set the second byte range.
+       //
+       if (seq_index_ == 0)
+       {
+         if ((valid = (b == 0xE0)))
+           byte2_range_ = make_pair (0xA0, 0xBF);
+         else if ((valid = belongs (b, 0xE1, 0xEC)))
+           byte2_range_ = make_pair (0x80, 0xBF);
+         else if ((valid = (b == 0xED)))
+           byte2_range_ = make_pair (0x80, 0x9F);
+         else if ((valid = belongs (b, 0xEE, 0xEF)))
+           byte2_range_ = make_pair (0x80, 0xBF);
+       }
+       else if (seq_index_ == 1) // Check the second byte.
+         valid = belongs (b, byte2_range_.first, byte2_range_.second);
+       else                      // Check the third byte.
+         valid = belongs (b, 0x80, 0xBF);
+
+       break;
+     }
+   case 4:
+     {
+       // [010000 03FFFF]: F0      [90 BF] [80 BF] [80 BF]
+       // [040000 0FFFFF]: [F1 F3] [80 BF] [80 BF] [80 BF]
+       // [100000 10FFFF]: F4      [80 8F] [80 BF] [80 BF]
+       //
+       // Check the first byte and set the second byte range.
+       //
+       if (seq_index_ == 0)
+       {
+         if ((valid = (b == 0xF0)))
+           byte2_range_ = make_pair (0x90, 0xBF);
+         else if ((valid = belongs (b, 0xF1, 0xF3)))
+           byte2_range_ = make_pair (0x80, 0xBF);
+         else if ((valid = (b == 0xF4)))
+           byte2_range_ = make_pair (0x80, 0x8F);
+       }
+       else if (seq_index_ == 1) // Check the second byte.
+         valid = belongs (b, byte2_range_.first, byte2_range_.second);
+       else                      // Check the third and fourth bytes.
+         valid = belongs (b, 0x80, 0xBF);
+
+       break;
+     }
+   }
+
+   // Bail out if the current UTF-8 sequence byte is invalid.
+   //
+   if (!valid)
+   {
+     // We could probably distinguish "surrogate" and "exceed max value" from
+     // other ill-formedness cases (amend the well-formedness table, keep
+     // decoding the sequence, and test the codepoint in the end) and produce
+     // more specific error messages, but this doesn't seem worth the
+     // trouble.
+     //
+     if (what != nullptr)
+       *what = byte_error ();
+
+     return make_pair (false, false); // Invalid byte.
+   }
+
+   // "Append" the sequence byte's 6 rightmost bits to the resulting
+   // codepoint value, unless this is the first byte (whose value is already
+   // taken into account; see above).
+   //
+   if (seq_index_ != 0)
+     codepoint_ = (codepoint_ << 6) | (b & 0x3F);
+
+   // If we didn't get to the end of the UTF-8 sequence, then we are done
+   // with this byte.
+   //
+   if (++seq_index_ != seq_size_)
+     return make_pair (true, false); // Valid byte.
+
+   // Prepare for the next UTF-8 sequence validation, regardless of the
+   // decoded codepoint validity.
+   //
+   seq_index_ = 0;
+
+   // Check the decoded codepoint, unless any codepoint type is allowed.
+   //
+   // Note that the well-formedness sequence check guarantees that we decoded
+   // a valid Unicode codepoint (see above).
+   //
+   if (types_ == codepoint_types::any)
+     return make_pair (true, true); // Valid codepoint.
+
+   // Check if the decoded codepoint is whitelisted. Note that the whitelist
+   // is scanned up to (but not including) its terminating zero character.
+   //
+   using traits = u32string::traits_type;
+
+   if (whitelist_ != nullptr &&
+       traits::find (whitelist_, traits::length (whitelist_), codepoint_) !=
+       nullptr)
+     return make_pair (true, true); // Valid codepoint.
+
+   // Now check if the codepoint type matches the specified set. Note: also
+   // covers the `types_ == codepoint_types::none` case.
+   //
+   codepoint_types t (codepoint_type (codepoint_));
+
+   if ((t & types_) != codepoint_types::none)
+     return make_pair (true, true); // Valid codepoint.
+
+   if (what != nullptr)
+     *what = "invalid Unicode codepoint (" + to_string (t) + ")";
+
+   return make_pair (false, true); // Invalid codepoint.
+ }
+
+ // Try to resume the validation after an invalid byte, treating the
+ // specified byte as a potential start of a new sequence.
+ //
+ // The result has the same meaning as for validate(). No error description
+ // is produced.
+ //
+ inline std::pair<bool, bool> utf8_validator::
+ recover (char c)
+ {
+   // A sequence trailing byte (10xxxxxx) can never be interpreted as a
+   // sequence leading byte, so there is no point in running the full
+   // validation for it. Note that in this case the validator state is left
+   // untouched.
+   //
+   const bool trailing ((c & 0xC0) == 0x80);
+
+   if (!trailing)
+   {
+     // Restart from the beginning of a sequence and let validate() decide
+     // if the byte is a valid leading one.
+     //
+     seq_index_ = 0;
+     return validate (c);
+   }
+
+   return std::make_pair (false, false); // Invalid byte.
+ }
+
+ // Return the codepoint value accumulated by validate(). This is the fully
+ // decoded codepoint if the last validated byte completed a sequence and a
+ // partially decoded value if the validation is in the middle of one.
+ //
+ inline char32_t utf8_validator::
+ codepoint () const
+ {
+   const char32_t r (codepoint_);
+   return r;
+ }
+}