From 5ae9686adac1508873f2d980e84becd3496244c2 Mon Sep 17 00:00:00 2001 From: Karen Arutyunov Date: Wed, 26 Feb 2020 17:16:45 +0300 Subject: Add notion of validator to char_scanner and make sure manifest is UTF-8 This involves implementing utf8_validator and UTF-8 utility functions and using them during the manifest parsing, serialization, and rewriting. --- libbutl/utf8.ixx | 305 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 305 insertions(+) create mode 100644 libbutl/utf8.ixx (limited to 'libbutl/utf8.ixx') diff --git a/libbutl/utf8.ixx b/libbutl/utf8.ixx new file mode 100644 index 0000000..3d2e092 --- /dev/null +++ b/libbutl/utf8.ixx @@ -0,0 +1,305 @@ +// file : libbutl/utf8.ixx -*- C++ -*- +// license : MIT; see accompanying LICENSE file + +namespace butl +{ + inline utf8_validator:: + utf8_validator (codepoint_types ts, const char32_t* wl) + : types_ (ts), + whitelist_ (wl) + { + } + + inline std::pair utf8_validator:: + validate (char c) + { + return validate (c, nullptr /* what */); + } + + inline std::pair utf8_validator:: + validate (char c, std::string& what) + { + return validate (c, &what); + } + + inline std::pair utf8_validator:: + validate (char c, std::string* what) + { + using namespace std; + + // A UCS-4 character is encoded as the UTF-8 byte sequence as follows, + // depending on the value range it falls into: + // + // 0x00000000 - 0x0000007F: + // 0xxxxxxx + // + // 0x00000080 - 0x000007FF: + // 110xxxxx 10xxxxxx + // + // 0x00000800 - 0x0000FFFF: + // 1110xxxx 10xxxxxx 10xxxxxx + // + // 0x00010000 - 0x001FFFFF: + // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + // + // 0x00200000 - 0x03FFFFFF: + // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + // + // 0x04000000 - 0x7FFFFFFF: + // 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + // + // Also note that the Unicode Standard (as of 12.1) specifies no + // codepoints above 0x10FFFF, so we consider 5- and 6-byte length UTF-8 + // sequences as invalid (we could have added `unspecified` codepoint type + // except that there are no UTF-8 validation tables defined for these + // sequences). + // + unsigned char b (c); + + // Compose the detailed "invalid UTF-8 sequence byte" error. + // + auto byte_error = [c, b, this] () + { + string s ("invalid UTF-8 sequence "); + + const char* names[] = {"first", "second", "third", "forth"}; + s += names[seq_index_]; + s += " byte (0x"; + + const char digits[] = "0123456789ABCDEF"; + s += digits[(b >> 4) & 0xF]; + s += digits[b & 0xF]; + + // If the byte happens to be a printable ASCII character then let's + // print it as a character as well. This can help a bit with grepping + // through text while troubleshooting. + // + if (b >= 0x20 && b <= 0x7E) + { + s += " '"; + s += c; + s += "'"; + } + + s += ")"; + return s; + }; + + // Detect the byte sequence length based on its first byte. While at it, + // start calculating the resulting Unicode codepoint value. + // + if (seq_index_ == 0) + { + if (b < 0x80) + { + seq_size_ = 1; + codepoint_ = b; + } + else if (b < 0xE0) + { + seq_size_ = 2; + codepoint_ = b & 0x1F; // Takes 5 rightmost bits. + } + else if (b < 0xF0) + { + seq_size_ = 3; + codepoint_ = b & 0xF; // Takes 4 rightmost bits. + } + else if (b < 0xF8) + { + seq_size_ = 4; + codepoint_ = b & 0x7; // Takes 3 rightmost bits. + } + else + { + if (what != nullptr) + { + if (b < 0xFE) + { + *what = b < 0xFC ? "5" : "6"; + *what += "-byte length UTF-8 sequence"; + } + else + *what = byte_error (); + } + + return make_pair (false, false); // Invalid byte. + } + } + + // Note that while a codepoint may potentially be encoded with byte + // sequences of different lengths, only the shortest encoding sequence is + // considered well-formed. Also a well-formed sequence may not be decoded + // into invalid codepoint value (see codepoint_type() for details). We + // will check all that using the Well-Formed UTF-8 Byte Sequences table + // (provided by the Unicode 12.0 Standard) which also takes care of the + // missing UTF-8 sequence bytes. + // + bool valid (false); + + // Return true if a byte value belongs to the specified range. + // + auto belongs = [] (unsigned char c, unsigned char l, unsigned char r) + { + return c >= l && c <= r; + }; + + switch (seq_size_) + { + case 1: valid = true; break; // Well-formed by the definition (see above). + case 2: + { + // [000080 0007FF]: [C2 DF] [80 BF] + // + // Check the first byte and set the second byte range. + // + if (seq_index_ == 0) + { + if ((valid = belongs (b, 0xC2, 0xDF))) + byte2_range_ = make_pair (0x80, 0xBF); + } + else // Check the second byte. + valid = belongs (b, byte2_range_.first, byte2_range_.second); + + break; + } + case 3: + { + // [000800 000FFF]: E0 [A0 BF] [80 BF] + // [001000 00CFFF]: [E1 EC] [80 BF] [80 BF] + // [00D000 00D7FF]: ED [80 9F] [80 BF] ; Excludes surrogates. + // [00E000 00FFFF]: [EE EF] [80 BF] [80 BF] + // + // Check the first byte and set the second byte range. + // + if (seq_index_ == 0) + { + if ((valid = (b == 0xE0))) + byte2_range_ = make_pair (0xA0, 0xBF); + else if ((valid = belongs (b, 0xE1, 0xEC))) + byte2_range_ = make_pair (0x80, 0xBF); + else if ((valid = (b == 0xED))) + byte2_range_ = make_pair (0x80, 0x9F); + else if ((valid = belongs (b, 0xEE, 0xEF))) + byte2_range_ = make_pair (0x80, 0xBF); + } + else if (seq_index_ == 1) // Check the second byte. + valid = belongs (b, byte2_range_.first, byte2_range_.second); + else // Check the third byte. + valid = belongs (b, 0x80, 0xBF); + + break; + } + case 4: + { + // [010000 03FFFF]: F0 [90 BF] [80 BF] [80 BF] + // [040000 0FFFFF]: [F1 F3] [80 BF] [80 BF] [80 BF] + // [100000 10FFFF]: F4 [80 8F] [80 BF] [80 BF] + // + // Check the first byte and set the second byte range. + // + if (seq_index_ == 0) + { + if ((valid = (b == 0xF0))) + byte2_range_ = make_pair (0x90, 0xBF); + else if ((valid = belongs (b, 0xF1, 0xF3))) + byte2_range_ = make_pair (0x80, 0xBF); + else if ((valid = (b == 0xF4))) + byte2_range_ = make_pair (0x80, 0x8F); + } + else if (seq_index_ == 1) // Check the second byte. + valid = belongs (b, byte2_range_.first, byte2_range_.second); + else // Check the third and forth bytes. + valid = belongs (b, 0x80, 0xBF); + + break; + } + } + + // Bail out if the current UTF-8 sequence byte is invalid. + // + if (!valid) + { + // We could probably distinguish "surrogate" and "exceed max value" from + // other ill-formedness cases (amend the well-formedness table, keep + // decoding the sequence, and test the codepoint in the end) and produce + // more specific error messages, but this doesn't seem worth the + // trouble. + // + if (what != nullptr) + *what = byte_error (); + + return make_pair (false, false); // Invalid byte. + } + + // "Append" the sequence byte's 6 rightmost bits to the resulting + // codepoint value, unless this is the first byte (which value is already + // taken into account; see above). + // + if (seq_index_ != 0) + codepoint_ = (codepoint_ << 6) | (b & 0x3F); + + // If we didn't get to the end of the UTF-8 sequence, then we are done + // with this byte. + // + if (++seq_index_ != seq_size_) + return make_pair (true, false); // Valid byte. + + // Prepare for the next UTF-8 sequence validation, regardless of the + // decoded codepoint validity. + // + seq_index_ = 0; + + // Check the decoded codepoint, unless any codepoint type is allowed. + // + // Note that the well-formedness sequence check guarantees that we decoded + // a valid Unicode codepoint (see above). + // + if (types_ == codepoint_types::any) + return make_pair (true, true); // Valid codepoint. + + // Check if the decoded codepoint is whitelisted. + // + using traits = u32string::traits_type; + + if (whitelist_ != nullptr && + traits::find (whitelist_, traits::length (whitelist_), codepoint_) != + nullptr) + return make_pair (true, true); // Valid codepoint. + + // Now check if the codepoint type matches the specified set. Note: also + // covers the `types_ == codepoint_types::none` case. + // + codepoint_types t (codepoint_type (codepoint_)); + + if ((t & types_) != codepoint_types::none) + return make_pair (true, true); // Valid codepoint. + + if (what != nullptr) + *what = "invalid Unicode codepoint (" + to_string (t) + ")"; + + return make_pair (false, true); // Invalid codepoint. + } + + inline std::pair utf8_validator:: + recover (char c) + { + // We are recovered if the character can be interpreted as a sequence + // leading byte. + // + // As an optimization, bail out if the byte is a sequence trailing byte + // (10xxxxxx). + // + if ((c & 0xC0) == 0x80) + return std::make_pair (false, false); // Invalid byte. + + seq_index_ = 0; + return validate (c); + } + + inline char32_t utf8_validator:: + codepoint () const + { + return codepoint_; + } +} -- cgit v1.1