From 5ae9686adac1508873f2d980e84becd3496244c2 Mon Sep 17 00:00:00 2001 From: Karen Arutyunov Date: Wed, 26 Feb 2020 17:16:45 +0300 Subject: Add notion of validator to char_scanner and make sure manifest is UTF-8 This involves implementing utf8_validator and UTF-8 utility functions and using them during the manifest parsing, serialization, and rewriting. --- libbutl/char-scanner.cxx | 126 ------------- libbutl/char-scanner.ixx | 56 +++++- libbutl/char-scanner.mxx | 90 +++++++-- libbutl/char-scanner.txx | 146 +++++++++++++++ libbutl/manifest-parser.cxx | 24 ++- libbutl/manifest-parser.ixx | 31 ++++ libbutl/manifest-parser.mxx | 33 +++- libbutl/manifest-rewriter.cxx | 22 ++- libbutl/manifest-serializer.cxx | 84 +++++++-- libbutl/manifest-serializer.mxx | 16 +- libbutl/standard-version.cxx | 2 + libbutl/unicode.cxx | 165 +++++++++++++++++ libbutl/unicode.ixx | 72 ++++++++ libbutl/unicode.mxx | 82 +++++++++ libbutl/utf8.cxx | 342 ----------------------------------- libbutl/utf8.ixx | 305 +++++++++++++++++++++++++++++++ libbutl/utf8.mxx | 130 +++++++++++++ libbutl/utility.cxx | 132 ++++++++++++++ libbutl/utility.ixx | 95 +++++++--- libbutl/utility.mxx | 70 +++---- tests/manifest-parser/driver.cxx | 40 +++- tests/manifest-rewriter/driver.cxx | 4 + tests/manifest-serializer/driver.cxx | 18 ++ tests/utf8/driver.cxx | 207 +++++++++++++++++++-- 24 files changed, 1696 insertions(+), 596 deletions(-) delete mode 100644 libbutl/char-scanner.cxx create mode 100644 libbutl/char-scanner.txx create mode 100644 libbutl/unicode.cxx create mode 100644 libbutl/unicode.ixx create mode 100644 libbutl/unicode.mxx delete mode 100644 libbutl/utf8.cxx create mode 100644 libbutl/utf8.ixx create mode 100644 libbutl/utf8.mxx diff --git a/libbutl/char-scanner.cxx b/libbutl/char-scanner.cxx deleted file mode 100644 index 85416e5..0000000 --- a/libbutl/char-scanner.cxx +++ /dev/null @@ -1,126 +0,0 @@ -// file : libbutl/char-scanner.cxx -*- C++ -*- -// license : MIT; see accompanying LICENSE file - 
-#ifndef __cpp_modules_ts -#include -#endif - -// C includes. - -#ifndef __cpp_lib_modules_ts -#include // char_traits -#include // uint64_t -#include -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -module butl.char_scanner; - -// Only imports additional to interface. -#ifdef __clang__ -#ifdef __cpp_lib_modules_ts -import std.core; -import std.io; -#endif -import butl.fdstream; -#endif - -#endif - -using namespace std; - -namespace butl -{ - char_scanner:: - char_scanner (istream& is, bool crlf, uint64_t l, uint64_t p) - : line (l), - column (1), - position (p), - is_ (is), - buf_ (dynamic_cast (is.rdbuf ())), - gptr_ (nullptr), - egptr_ (nullptr), - crlf_ (crlf) - { - } - - auto char_scanner:: - peek () -> xchar - { - if (unget_) - return ungetc_; - - if (unpeek_) - return unpeekc_; - - if (eos_) - return xchar (xchar::traits_type::eof (), line, column, position); - - int_type v (peek_ ()); - - if (v == xchar::traits_type::eof ()) - eos_ = true; - else if (crlf_ && v == '\r') - { - int_type v1; - do - { - get_ (); - v1 = peek_ (); - } - while (v1 == '\r'); - - if (v1 != '\n') - { - // We need to make sure subsequent calls to peek() return newline. - // - unpeek_ = true; - unpeekc_ = xchar ('\n', line, column, position); - - if (v1 == xchar::traits_type::eof ()) - eos_ = true; - } - - v = '\n'; - } - - return xchar (v, line, column, position); - } - - void char_scanner:: - get (const xchar& c) - { - if (unget_) - unget_ = false; - else - { - if (unpeek_) - { - unpeek_ = false; - } - // When is_.get () returns eof, the failbit is also set (stupid, - // isn't?) which may trigger an exception. To work around this - // we will call peek() first and only call get() if it is not - // eof. But we can only call peek() on eof once; any subsequent - // calls will spoil the failbit (even more stupid). 
- // - else if (!eos (c)) - get_ (); - - if (!eos (c)) - { - if (c == '\n') - { - line++; - column = 1; - } - else - column++; - - position = pos_ (); - } - } - } -} diff --git a/libbutl/char-scanner.ixx b/libbutl/char-scanner.ixx index 36cc93d..7e9c4b0 100644 --- a/libbutl/char-scanner.ixx +++ b/libbutl/char-scanner.ixx @@ -3,8 +3,30 @@ namespace butl { - inline auto char_scanner:: - get () -> xchar + template + inline char_scanner:: + char_scanner (std::istream& is, bool crlf, std::uint64_t l, std::uint64_t p) + : char_scanner (is, validator_type (), crlf, l, p) + { + } + + template + inline auto char_scanner:: + peek (std::string& what) -> xchar + { + return peek (&what); + } + + template + inline auto char_scanner:: + peek () -> xchar + { + return peek (nullptr /* what */); + } + + template + inline auto char_scanner:: + get (std::string* what) -> xchar { if (unget_) { @@ -13,13 +35,28 @@ namespace butl } else { - xchar c (peek ()); + xchar c (peek (what)); get (c); return c; } } - inline void char_scanner:: + template + inline auto char_scanner:: + get (std::string& what) -> xchar + { + return get (&what); + } + + template + inline auto char_scanner:: + get () -> xchar + { + return get (nullptr /* what */); + } + + template + inline void char_scanner:: unget (const xchar& c) { // Because iostream::unget cannot work once eos is reached, we have to @@ -29,7 +66,8 @@ namespace butl ungetc_ = c; } - inline auto char_scanner:: + template + inline auto char_scanner:: peek_ () -> int_type { if (gptr_ != egptr_) @@ -48,7 +86,8 @@ namespace butl return r; } - inline void char_scanner:: + template + inline void char_scanner:: get_ () { int_type c; @@ -61,11 +100,14 @@ namespace butl else c = is_.get (); // About as fast as ignore() and way faster than tellg(). 
+ validated_ = false; + if (save_ != nullptr && c != xchar::traits_type::eof ()) save_->push_back (static_cast (c)); } - inline std::uint64_t char_scanner:: + template + inline std::uint64_t char_scanner:: pos_ () const { return buf_ != nullptr ? buf_->tellg () : 0; diff --git a/libbutl/char-scanner.mxx b/libbutl/char-scanner.mxx index 5ad3d61..e57245b 100644 --- a/libbutl/char-scanner.mxx +++ b/libbutl/char-scanner.mxx @@ -10,6 +10,8 @@ #ifndef __cpp_lib_modules_ts #include // char_traits #include // uint64_t +#include // INT_* +#include // pair, make_pair() #include #endif @@ -30,12 +32,26 @@ import butl.fdstream; LIBBUTL_MODEXPORT namespace butl { + // Refer to utf8_validator for details. + // + struct noop_validator + { + std::pair + validate (char) {return std::make_pair (true, true);} + + std::pair + validate (char c, std::string&) {return validate (c);} + }; + // Low-level character stream scanner. Normally used as a base for // higher-level lexers. // - class LIBBUTL_SYMEXPORT char_scanner + template + class char_scanner { public: + using validator_type = V; + // If the crlf argument is true, then recognize Windows newlines (0x0D // 0x0A) and convert them to just '\n' (0x0A). Note that a standalone // 0x0D is treated "as if" it was followed by 0x0A and multiple 0x0D @@ -49,7 +65,13 @@ LIBBUTL_MODEXPORT namespace butl // and position in the stream (useful when re-scanning data saved with the // save_* facility). // - char_scanner (std::istream& is, + char_scanner (std::istream&, + bool crlf = true, + std::uint64_t line = 1, + std::uint64_t position = 0); + + char_scanner (std::istream&, + validator_type, bool crlf = true, std::uint64_t line = 1, std::uint64_t position = 0); @@ -62,10 +84,10 @@ LIBBUTL_MODEXPORT namespace butl public: // Extended character. It includes line/column/position information and is - // capable of representing EOF. + // capable of representing EOF and invalid characters. 
// - // Note that implicit conversion of EOF to char_type results in NUL - // character (which means in most cases it is safe to compare xchar to + // Note that implicit conversion of EOF/invalid to char_type results in + // NUL character (which means in most cases it is safe to compare xchar to // char without checking for EOF). // class xchar @@ -76,6 +98,9 @@ LIBBUTL_MODEXPORT namespace butl using char_type = traits_type::char_type; int_type value; + + // Note that the column is of the codepoint this byte belongs to. + // std::uint64_t line; std::uint64_t column; @@ -84,9 +109,12 @@ LIBBUTL_MODEXPORT namespace butl // std::uint64_t position; + static int_type + invalid () {return traits_type::eof () != INT_MIN ? INT_MIN : INT_MAX;} + operator char_type () const { - return value != traits_type::eof () + return value != traits_type::eof () && value != invalid () ? static_cast (value) : char_type (0); } @@ -98,27 +126,44 @@ LIBBUTL_MODEXPORT namespace butl : value (v), line (l), column (c), position (p) {} }; + // Note that if any of the get() or peek() functions return an invalid + // character, then the scanning has failed and none of them should be + // called again. + xchar get (); + // As above but in case of an invalid character also return the + // description of why it is invalid. + // + xchar + get (std::string& what); + void get (const xchar& peeked); // Get previously peeked character (faster). void unget (const xchar&); - // Note that if there is an "ungot" character, peek() will return - // that. + // Note that if there is an "ungot" character, peek() will return that. // xchar peek (); - // Tests. In the future we can add tests line alpha(), alnum(), - // etc. + // As above but in case of an invalid character also return the + // description of why it is invalid. + // + xchar + peek (std::string& what); + + // Tests. In the future we can add tests line alpha(), alnum(), etc. 
// static bool eos (const xchar& c) {return c.value == xchar::traits_type::eof ();} + static bool + invalid (const xchar& c) {return c.value == xchar::invalid ();} + // Line, column and position of the next character to be extracted from // the stream by peek() or get(). // @@ -159,8 +204,8 @@ LIBBUTL_MODEXPORT namespace butl }; protected: - using int_type = xchar::int_type; - using char_type = xchar::char_type; + using int_type = typename xchar::int_type; + using char_type = typename xchar::char_type; int_type peek_ (); @@ -171,11 +216,27 @@ LIBBUTL_MODEXPORT namespace butl std::uint64_t pos_ () const; + xchar + get (std::string* what); + + xchar + peek (std::string* what); + protected: std::istream& is_; - // Note that if you are reading from the buffer directly, then it is - // also your responsibility to save the data. + validator_type val_; + bool decoded_ = true; // The peeked character is last byte of sequence. + bool validated_ = false; // The peeked character has been validated. + + // Note that if you are reading from the buffer directly, then it is also + // your responsibility to call the validator and save the data (see + // save_*(). + // + // Besides that, make sure that the peek() call preceding the scan is + // followed by the get() call (see validated_, decoded_, and unpeek_ for + // the hairy details; realistically, you would probably only direct-scan + // ASCII fragments). // fdbuf* buf_; // NULL if not ifdstream. 
const char_type* gptr_; @@ -195,3 +256,4 @@ LIBBUTL_MODEXPORT namespace butl } #include +#include diff --git a/libbutl/char-scanner.txx b/libbutl/char-scanner.txx new file mode 100644 index 0000000..d4e2082 --- /dev/null +++ b/libbutl/char-scanner.txx @@ -0,0 +1,146 @@ +// file : libbutl/char-scanner.txx -*- C++ -*- +// license : MIT; see accompanying LICENSE file + +#ifndef __cpp_lib_modules_ts +#include // move +#endif + +namespace butl +{ + template + char_scanner:: + char_scanner (std::istream& is, + validator_type v, + bool crlf, + std::uint64_t l, + std::uint64_t p) + : line (l), + column (1), + position (p), + is_ (is), + val_ (std::move (v)), + buf_ (dynamic_cast (is.rdbuf ())), + gptr_ (nullptr), + egptr_ (nullptr), + crlf_ (crlf) + { + } + + template + auto char_scanner:: + peek (std::string* what) -> xchar + { + if (unget_) + return ungetc_; + + if (unpeek_) + return unpeekc_; + + if (eos_) + return xchar (xchar::traits_type::eof (), line, column, position); + + int_type v (peek_ ()); + + if (v == xchar::traits_type::eof ()) + { + if (!decoded_) + { + if (what != nullptr) + *what = "unexpected end of stream"; + + v = xchar::invalid (); + } + + eos_ = true; + } + else + { + auto valid = [what, this] (int_type v) + { + if (validated_) + return true; + + char c (xchar::traits_type::to_char_type (v)); + std::pair r (what != nullptr + ? val_.validate (c, *what) + : val_.validate (c)); + + decoded_ = r.second; + validated_ = true; + return r.first; + }; + + if (!valid (v)) + v = xchar::invalid (); + else if (crlf_ && v == '\r') + { + // Note that '\r' is a valid character (otherwise we won't be here), + // so we don't validate it again below. We also postpone the + // validation of the next non-'\r' character (except EOF) until the + // next peek() call. + // + int_type v1; + do + { + get_ (); // Sets validated_ to false. + v1 = peek_ (); + } + while (v1 == '\r'); + + if (v1 != '\n') + { + // We need to make sure subsequent calls to peek() return newline. 
+ // + unpeek_ = true; + unpeekc_ = xchar ('\n', line, column, position); + + // Note that the previous character is decoded ('\r') and so EOF is + // legitimate. + // + if (v1 == xchar::traits_type::eof ()) + eos_ = true; + } + + v = '\n'; + } + } + + return xchar (v, line, column, position); + } + + template + void char_scanner:: + get (const xchar& c) + { + if (unget_) + unget_ = false; + else + { + if (unpeek_) + { + unpeek_ = false; + } + // When is_.get () returns eof, the failbit is also set (stupid, + // isn't?) which may trigger an exception. To work around this + // we will call peek() first and only call get() if it is not + // eof. But we can only call peek() on eof once; any subsequent + // calls will spoil the failbit (even more stupid). + // + else if (!eos (c)) + get_ (); + + if (!eos (c)) + { + if (c == '\n') + { + line++; + column = 1; + } + else if (decoded_) // The character is the last in a sequence? + column++; + + position = pos_ (); + } + } + } +} diff --git a/libbutl/manifest-parser.cxx b/libbutl/manifest-parser.cxx index 4de59b7..9514bbd 100644 --- a/libbutl/manifest-parser.cxx +++ b/libbutl/manifest-parser.cxx @@ -89,7 +89,7 @@ namespace butl parse_name (r); skip_spaces (); - c = get (); + c = get ("manifest"); if (eos (c)) { @@ -117,7 +117,7 @@ namespace butl skip_spaces (); parse_value (r); - c = peek (); + c = peek ("manifest"); // The character after the value should be either a newline or eos. // @@ -126,7 +126,7 @@ namespace butl r.end_pos = c.position; if (c == '\n') - get (); + get (c); // Now figure out whether what we've got makes sense, depending // on the state we are in. 
@@ -217,6 +217,8 @@ namespace butl void manifest_parser:: parse_name (name_value& r) { + auto peek = [this] () {return manifest_parser::peek ("manifest name");}; + xchar c (peek ()); r.name_line = c.line; @@ -228,13 +230,19 @@ namespace butl break; r.name += c; - get (); + get (c); } } void manifest_parser:: parse_value (name_value& r) { + auto peek = [this] () {return manifest_parser::peek ("manifest value");}; + + // Here we don't always track the last peeked character. + // + auto get = [this] () {manifest_parser::get ("manifest value");}; + xchar c (peek ()); r.value_line = c.line; @@ -408,6 +416,8 @@ namespace butl pair manifest_parser:: skip_spaces () { + auto peek = [this] () {return manifest_parser::peek ("manifest");}; + xchar c (peek ()); bool start (c.column == 1); uint64_t lp (c.position); @@ -437,12 +447,12 @@ namespace butl if (!start) return make_pair (c, lp); - get (); + get (c); // Read until newline or eos. // for (c = peek (); !eos (c) && c != '\n'; c = peek ()) - get (); + get (c); continue; } @@ -450,7 +460,7 @@ namespace butl return make_pair (c, lp); // Not a space. 
} - get (); + get (c); } return make_pair (c, lp); diff --git a/libbutl/manifest-parser.ixx b/libbutl/manifest-parser.ixx index e616ad9..bc5246c 100644 --- a/libbutl/manifest-parser.ixx +++ b/libbutl/manifest-parser.ixx @@ -3,6 +3,37 @@ namespace butl { + + inline auto manifest_parser:: + get (const char* what) -> xchar + { + xchar c (base::get (ebuf_)); + + if (invalid (c)) + throw manifest_parsing (name_, + c.line, c.column, + std::string ("invalid ") + what + ": " + ebuf_); + return c; + } + + inline void manifest_parser:: + get (const xchar& peeked) + { + base::get (peeked); + } + + inline auto manifest_parser:: + peek (const char* what) -> xchar + { + xchar c (base::peek (ebuf_)); + + if (invalid (c)) + throw manifest_parsing (name_, + c.line, c.column, + std::string ("invalid ") + what + ": " + ebuf_); + return c; + } + inline manifest_name_value manifest_parser:: next () { diff --git a/libbutl/manifest-parser.mxx b/libbutl/manifest-parser.mxx index adf6181..77addff 100644 --- a/libbutl/manifest-parser.mxx +++ b/libbutl/manifest-parser.mxx @@ -25,10 +25,12 @@ export module butl.manifest_parser; import std.core; import std.io; #endif +import butl.utf8; import butl.optional; import butl.char_scanner; import butl.manifest_types; #else +#include #include #include #include @@ -54,7 +56,8 @@ LIBBUTL_MODEXPORT namespace butl std::string description; }; - class LIBBUTL_SYMEXPORT manifest_parser: protected butl::char_scanner + class LIBBUTL_SYMEXPORT manifest_parser: + protected char_scanner { public: // The filter, if specified, is called by next() prior to returning the @@ -69,7 +72,10 @@ LIBBUTL_MODEXPORT namespace butl manifest_parser (std::istream& is, const std::string& name, std::function filter = {}) - : char_scanner (is), name_ (name), filter_ (std::move (filter)) {} + : char_scanner (is, + utf8_validator (codepoint_types::graphic, U"\n\r\t")), + name_ (name), + filter_ (std::move (filter)) {} const std::string& name () const {return name_;} @@ -97,6 +103,8 
@@ LIBBUTL_MODEXPORT namespace butl split_comment (const std::string&); private: + using base = char_scanner; + void parse_next (manifest_name_value&); @@ -114,12 +122,33 @@ LIBBUTL_MODEXPORT namespace butl std::pair skip_spaces (); + // As base::get() but in case of an invalid character throws + // manifest_parsing. + // + xchar + get (const char* what); + + // Get previously peeked character (faster). + // + void + get (const xchar&); + + // As base::peek() but in case of an invalid character throws + // manifest_parsing. + // + xchar + peek (const char* what); + private: const std::string name_; const std::function filter_; enum {start, body, end} s_ = start; std::string version_; // Current format version. + + // Buffer for a get()/peek() potential error. + // + std::string ebuf_; }; // Parse and return a single manifest. Throw manifest_parsing in case of an diff --git a/libbutl/manifest-rewriter.cxx b/libbutl/manifest-rewriter.cxx index ba0c866..e38d5f4 100644 --- a/libbutl/manifest-rewriter.cxx +++ b/libbutl/manifest-rewriter.cxx @@ -30,8 +30,10 @@ import butl.fdstream; import butl.manifest_types; #endif +import butl.utility; // utf8_length() import butl.manifest_serializer; #else +#include #include #endif @@ -101,8 +103,16 @@ namespace butl manifest_serializer s (os, path_.string (), long_lines_); + // Note that the name can be surrounded with the ASCII whitespace + // characters and the start_pos refers to the first character in the + // line. + // + // Also note that we assume the already serialized name to be a valid + // UTF-8 byte string and so utf8_length() may not throw. 
+ // s.write_value (nv.value, - static_cast (nv.colon_pos - nv.start_pos + 2)); + static_cast (nv.colon_pos - nv.start_pos) - + (nv.name.size () - utf8_length (nv.name)) + 2); } os << suffix; @@ -128,15 +138,21 @@ namespace butl os << '\n'; manifest_serializer s (os, path_.string (), long_lines_); - s.write_name (nv.name); + size_t n (s.write_name (nv.name)); os << ':'; if (!nv.value.empty ()) { os << ' '; + + // Note that the name can be surrounded with the ASCII whitespace + // characters and the start_pos refers to the first character in the + // line. + // s.write_value (nv.value, - static_cast (nv.colon_pos - nv.start_pos + 2)); + static_cast (nv.colon_pos - nv.start_pos) - + (nv.name.size () - n) + 2); } os << suffix; diff --git a/libbutl/manifest-serializer.cxx b/libbutl/manifest-serializer.cxx index 0a81478..6a26a15 100644 --- a/libbutl/manifest-serializer.cxx +++ b/libbutl/manifest-serializer.cxx @@ -30,6 +30,11 @@ import std.io; import butl.manifest_types; #endif +import butl.utf8; +import butl.utility; +#else +#include +#include #endif using namespace std; @@ -86,13 +91,13 @@ namespace butl break; } - write_name (n); + size_t l (write_name (n)); os_ << ':'; if (!v.empty ()) { os_ << ' '; - write_value (v, n.size () + 2); + write_value (v, l + 2); } os_ << endl; @@ -111,6 +116,10 @@ namespace butl if (s_ == end) throw serialization (name_, "serialization after eos"); + string what; + if (!utf8 (t, what, codepoint_types::graphic, U"\n\r\t")) + throw serialization (name_, "invalid comment: " + what); + os_ << '#'; if (!t.empty ()) @@ -144,7 +153,7 @@ namespace butl return r; } - void manifest_serializer:: + size_t manifest_serializer:: write_name (const string& n) { if (n.empty ()) @@ -153,43 +162,76 @@ namespace butl if (n[0] == '#') throw serialization (name_, "name starts with '#'"); + size_t r (0); + pair v; + utf8_validator val (codepoint_types::graphic, U"\n\r\t"); + + string what; for (char c: n) { - switch (c) + v = val.validate (c, what); + + if 
(!v.first) + throw serialization (name_, "invalid name: " + what); + + if (v.second) // Sequence last byte? { - case ' ': - case '\t': - case '\r': - case '\n': throw serialization (name_, "name contains whitespace"); - case ':': throw serialization (name_, "name contains ':'"); - default: break; + // Note: ASCII characters may not be a part of a multi-byte sequence. + // + switch (c) + { + case ' ': + case '\t': + case '\r': + case '\n': throw serialization (name_, "name contains whitespace"); + case ':': throw serialization (name_, "name contains ':'"); + default: break; + } + + ++r; } } + // Make sure that the last UTF-8 sequence is complete. + // + if (!v.second) + throw serialization (name_, "invalid name: incomplete UTF-8 sequence"); + os_ << n; + return r; } void manifest_serializer:: write_value (const char* s, size_t n, size_t cl) { + utf8_validator val (codepoint_types::graphic, U"\n\r\t"); + char c ('\0'); + bool b (true); // Begin of UTF-8 byte sequence. - // The idea is to break on the 77th character (i.e., write it - // on the next line) which means we have written 76 characters + // The idea is to break on the 77th codepoint (i.e., write it + // on the next line) which means we have written 76 codepoints // on this line plus 2 for '\' and '\n', which gives us 78. // - for (const char* e (s + n); s != e; s++, cl++) + string what; + for (const char* e (s + n); s != e; s++) { char pc (c); c = *s; + pair v (val.validate (c, what)); + + if (!v.first) + throw serialization (name_, "invalid value: " + what); + // Note that even the "hard" break (see below) is not that hard when it // comes to breaking the line right after the backslash. Doing so would // inject the redundant newline character, as the line-terminating // backslash would be escaped. So we delay breaking till the next - // non-backslash character. + // non-backslash character. We also delay until the beginning of a UTF-8 + // sequence. 
// - if (pc != '\\' && !long_lines_) + if (pc != '\\' && b && !long_lines_) { bool br (false); // Break the line. @@ -237,8 +279,18 @@ namespace butl } os_ << c; + + b = v.second; + + if (b) + ++cl; } + // Make sure that the last UTF-8 sequence is complete. + // + if (!b) + throw serialization (name_, "invalid value: incomplete UTF-8 sequence"); + // What comes next is always a newline. If the last character that // we have written is a backslash, escape it. // @@ -256,7 +308,7 @@ namespace butl // Use the multi-line mode in any of the following cases: // - // - column offset is too large (say greater than 39 (78/2) characters; we + // - column offset is too large (say greater than 39 (78/2) codepoints; we // cannot start on the next line since that would start the multi-line // mode) // - value contains newlines diff --git a/libbutl/manifest-serializer.mxx b/libbutl/manifest-serializer.mxx index f114ffb..b73c255 100644 --- a/libbutl/manifest-serializer.mxx +++ b/libbutl/manifest-serializer.mxx @@ -60,7 +60,7 @@ LIBBUTL_MODEXPORT namespace butl const std::string& value); // Unless long_lines is true, break lines in values (including multi-line) - // so that their length does not exceed 78 characters (including '\n'). + // so that their length does not exceed 78 codepoints (including '\n'). // manifest_serializer (std::ostream& os, const std::string& name, @@ -108,23 +108,23 @@ LIBBUTL_MODEXPORT namespace butl void write_next (const std::string& name, const std::string& value); - // Validate and write a name. + // Validate and write a name and return its length in codepoints. // - void + size_t write_name (const std::string&); // Write a value assuming the current line already has the specified - // offset. If the resulting line length would be too large then the - // multi-line representation will be used. It is assumed that the name, - // followed by the colon, is already written. + // codepoint offset. 
If the resulting line length would be too large then + // the multi-line representation will be used. It is assumed that the + // name, followed by the colon, is already written. // void write_value (const std::string&, std::size_t offset); // Write the specified number of characters from the specified string // (assuming there are no newlines) split into multiple lines at or near - // the 78 characters boundary. Assume the current line already has the - // specified offset. + // the 78 codepoints boundary. Assume the current line already has the + // specified codepoint offset. // void write_value (const char* s, std::size_t n, std::size_t offset); diff --git a/libbutl/standard-version.cxx b/libbutl/standard-version.cxx index c27b064..a9f5eb8 100644 --- a/libbutl/standard-version.cxx +++ b/libbutl/standard-version.cxx @@ -41,6 +41,8 @@ using namespace std; namespace butl { + using std::to_string; + // Parse uint64_t from the specified string starting at the specified // position and check the min/max constraints. If successful, save the // result, update the position to point to the next character, and return diff --git a/libbutl/unicode.cxx b/libbutl/unicode.cxx new file mode 100644 index 0000000..4219846 --- /dev/null +++ b/libbutl/unicode.cxx @@ -0,0 +1,165 @@ +// file : libbutl/unicode.cxx -*- C++ -*- +// license : MIT; see accompanying LICENSE file + +#ifndef __cpp_modules_ts +#include +#endif + +#ifndef __cpp_lib_modules_ts +#include +#include +#include + +#include // size_t +#include // pair +#include // lower_bound() +#endif + +#ifdef __cpp_modules_ts +module butl.unicode; + +// Only imports additional to interface. +#ifdef __clang__ +#ifdef __cpp_lib_modules_ts +import std.core; +import std.io; +#endif +#endif + +#endif + +using namespace std; + +namespace butl +{ + // Sorted arrays of the Unicode codepoint ranges corresponding to the + // codepoint types (see the Types of Code Points table in the Unicode 12.0 + // Standard for details). 
Note that code type range lists (but not ranges + // themselves) may overlap. + // + // Also note that the graphic type codepoints are numerous and scattered. + // Thus, we will consider a codepoint to be of the graphic type if it is not + // of any other type. + // + using codepoint_range = pair; + + static const codepoint_range cn_rs[] = // Control. + { + {0x00, 0x1F}, + {0x7F, 0x9F} + }; + + static const codepoint_range fr_rs[] = // Format. + { + {0x000AD, 0x000AD}, + {0x00600, 0x00605}, + {0x0061C, 0x0061C}, + {0x006DD, 0x006DD}, + {0x0070F, 0x0070F}, + {0x008E2, 0x008E2}, + {0x0180E, 0x0180E}, + {0x0200B, 0x0200F}, + {0x0202A, 0x0202E}, + {0x02060, 0x02064}, + {0x02066, 0x0206F}, + {0x0FEFF, 0x0FEFF}, + {0x0FFF9, 0x0FFFB}, + {0x110BD, 0x110BD}, + {0x110CD, 0x110CD}, + {0x13430, 0x13438}, + {0x1BCA0, 0x1BCA3}, + {0x1D173, 0x1D17A}, + {0xE0001, 0xE0001}, + {0xE0020, 0xE007F} + }; + + static const codepoint_range pr_rs[] = // Private-use. + { + {0x00E000, 0x00F8FF}, + {0x0F0000, 0x10FFFF} + }; + + static const codepoint_range nc_rs[] = // Non-character. + { + {0xFDD0, 0xFDEF} + }; + + static const codepoint_range rs_rs[] = // Reserved. 
+ { + {0x30000, 0xE0000}, + {0xE0002, 0xE001F}, + {0xE0080, 0xE00FF}, + {0xE01F0, 0xEFFFF} + }; + + struct codepoint_type_ranges + { + codepoint_types type; + const codepoint_range* begin; + const codepoint_range* end; + }; + + static const codepoint_type_ranges ct_ranges[] = + { + { + codepoint_types::control, + cn_rs, + cn_rs + sizeof (cn_rs) / sizeof (*cn_rs) + }, + { + codepoint_types::format, + fr_rs, + fr_rs + sizeof (fr_rs) / sizeof (*fr_rs) + }, + { + codepoint_types::private_use, + pr_rs, + pr_rs + sizeof (pr_rs) / sizeof (*pr_rs) + }, + { + codepoint_types::non_character, + nc_rs, + nc_rs + sizeof (nc_rs) / sizeof (*nc_rs) + }, + { + codepoint_types::reserved, + rs_rs, + rs_rs + sizeof (rs_rs) / sizeof (*rs_rs) + } + }; + + // Return the codepoint type of a range if the codepoint value falls into + // one and the graphic type otherwise. + // + // Note that this is a type detection fallback (see codepoint_type() for + // details). + // + codepoint_types + codepoint_type_lookup (char32_t c) + { + // Note that the codepoint type range lists may overlap. Thus, we iterate + // over all of them until there is a match. + // + for (size_t i (0); i != sizeof (ct_ranges) / sizeof (*ct_ranges); ++i) + { + const codepoint_type_ranges& rs (ct_ranges[i]); + + // Find the range that either contains the codepoint or lays to the + // right of it. Note that here we assume a range to be less than a + // codepoint value if it lays to the left of the codepoint. + // + const codepoint_range* r ( + lower_bound (rs.begin, rs.end, + c, + [] (const codepoint_range& r, char32_t c) + { + return r.second < c; + })); + + if (r != rs.end && r->first <= c) // Contains the codepoint? 
+ return rs.type; + } + + return codepoint_types::graphic; + } +} diff --git a/libbutl/unicode.ixx b/libbutl/unicode.ixx new file mode 100644 index 0000000..cba4fd2 --- /dev/null +++ b/libbutl/unicode.ixx @@ -0,0 +1,72 @@ +// file : libbutl/unicode.ixx -*- C++ -*- +// license : MIT; see accompanying LICENSE file + +namespace butl +{ + inline codepoint_types + operator&= (codepoint_types& x, codepoint_types y) + { + return x = static_cast ( + static_cast (x) & + static_cast (y)); + } + + inline codepoint_types + operator|= (codepoint_types& x, codepoint_types y) + { + return x = static_cast ( + static_cast (x) | + static_cast (y)); + } + + inline codepoint_types + operator& (codepoint_types x, codepoint_types y) + { + return x &= y; + } + + inline codepoint_types + operator| (codepoint_types x, codepoint_types y) + { + return x |= y; + } + + LIBBUTL_SYMEXPORT codepoint_types + codepoint_type_lookup (char32_t); + + inline codepoint_types + codepoint_type (char32_t c) + { + // Optimize for the common case (printable ASCII characters). + // + if (c >= 0x20 && c <= 0x7E) // Printable ASCII? + return codepoint_types::graphic; + else if (c > 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF)) // Invalid? + return codepoint_types::none; + else if ((c & 0xFFFF) >= 0xFFFE) // Non-range based? + return codepoint_types::non_character; + else + return codepoint_type_lookup (c); + } + + inline std::string + to_string (codepoint_types t) + { + // Note that we use the terms from the Unicode standard ("private-use" + // rather than "private use", "noncharacter" rather than "non-character"). + // + switch (t) + { + case codepoint_types::graphic: return "graphic"; + case codepoint_types::format: return "format"; + case codepoint_types::control: return "control"; + case codepoint_types::private_use: return "private-use"; + case codepoint_types::non_character: return "noncharacter"; // No dash. 
+ case codepoint_types::reserved: return "reserved"; + case codepoint_types::none: + case codepoint_types::any: return ""; + } + + return ""; // Types combination. + } +} diff --git a/libbutl/unicode.mxx b/libbutl/unicode.mxx new file mode 100644 index 0000000..b846476 --- /dev/null +++ b/libbutl/unicode.mxx @@ -0,0 +1,82 @@ +// file : libbutl/unicode.mxx -*- C++ -*- +// license : MIT; see accompanying LICENSE file + +#ifndef __cpp_modules_ts +#pragma once +#endif + +// C includes. + +#ifndef __cpp_lib_modules_ts +#include +#include +#include // uint16_t +#endif + +// Other includes. + +#ifdef __cpp_modules_ts +export module butl.unicode; +#ifdef __cpp_lib_modules_ts +import std.core; +import std.io; +#endif +#endif + +#include + +LIBBUTL_MODEXPORT namespace butl +{ + // Note that the Unicode Standard requires the surrogates ([D800 DFFF]) to + // only be used in the context of the UTF-16 character encoding form. Thus, + // we omit the surrogate codepoint type and assume surrogates as invalid + // codepoints. + // + enum class codepoint_types: std::uint16_t + { + // Useful to denote invalid codepoints or when building the type set + // incrementally. + // + none = 0x00, + + graphic = 0x01, // L(etter), M(ark), N(number), P(uncturation), + // S(symbol), Zs(separator, space) + format = 0x02, + control = 0x04, + private_use = 0x08, + non_character = 0x10, + reserved = 0x20, + + any = 0x3f + }; + + codepoint_types operator& (codepoint_types, codepoint_types); + codepoint_types operator| (codepoint_types, codepoint_types); + codepoint_types operator&= (codepoint_types&, codepoint_types); + codepoint_types operator|= (codepoint_types&, codepoint_types); + + // Return the codepoint type for a valid codepoint value and none otherwise. + // + // Note that the valid codepoint ranges are [0 D800) and (DFFF 10FFFF]. + // + codepoint_types + codepoint_type (char32_t); + + // Return the type name for a single codepoint type and empty string for + // `none` and `any`. 
+ // + // Potential future improvements: + // - add the none value name parameter ("invalid" by default) + // - produce names for type masks ("graphic, format", "any", etc) + // + std::string + to_string (codepoint_types); + + inline std::ostream& + operator<< (std::ostream& os, codepoint_types ts) + { + return os << to_string (ts); + } +} + +#include diff --git a/libbutl/utf8.cxx b/libbutl/utf8.cxx deleted file mode 100644 index 0f24559..0000000 --- a/libbutl/utf8.cxx +++ /dev/null @@ -1,342 +0,0 @@ -// file : libbutl/utf8.cxx -*- C++ -*- -// license : MIT; see accompanying LICENSE file - -#ifndef __cpp_modules_ts -#include -#endif - -#ifndef __cpp_lib_modules_ts -#include -#include - -#include // lower_bound() -#endif - -#ifdef __cpp_modules_ts -module butl.utility; - -// Only imports additional to interface. -#ifdef __clang__ -#ifdef __cpp_lib_modules_ts -import std.core; -import std.io; -#endif -#endif - -#endif - -namespace butl -{ - using namespace std; - - // Sorted arrays of the Unicode codepoint ranges corresponding to the - // codepoint types. Note that code type range lists (but not ranges - // themselves) may overlap. - // - // Note that the graphic type codepoints are numerous and scattered. Thus, - // we will consider a codepoint to be of the graphic type if it is not of - // any other type. - // - using codepoint_range = pair; - - static const codepoint_range cn_rs[] = // Control. - { - {0x00, 0x1F}, - {0x7F, 0x9F} - }; - - static const codepoint_range fr_rs[] = // Format. 
- { - {0x000AD, 0x000AD}, - {0x00600, 0x00605}, - {0x0061C, 0x0061C}, - {0x006DD, 0x006DD}, - {0x0070F, 0x0070F}, - {0x008E2, 0x008E2}, - {0x0180E, 0x0180E}, - {0x0200B, 0x0200F}, - {0x0202A, 0x0202E}, - {0x02060, 0x02064}, - {0x02066, 0x0206F}, - {0x0FEFF, 0x0FEFF}, - {0x0FFF9, 0x0FFFB}, - {0x110BD, 0x110BD}, - {0x110CD, 0x110CD}, - {0x13430, 0x13438}, - {0x1BCA0, 0x1BCA3}, - {0x1D173, 0x1D17A}, - {0xE0001, 0xE0001}, - {0xE0020, 0xE007F} - }; - - static const codepoint_range pr_rs[] = // Private-use. - { - {0x00E000, 0x00F8FF}, - {0x0F0000, 0x10FFFF} - }; - - static const codepoint_range nc_rs[] = // Non-character. - { - {0xFDD0, 0xFDEF} - }; - - static const codepoint_range rs_rs[] = // Reserved. - { - {0x30000, 0xE0000}, - {0xE0002, 0xE001F}, - {0xE0080, 0xE00FF}, - {0xE01F0, 0xEFFFF} - }; - - struct codepoint_type_ranges - { - codepoint_types type; - const codepoint_range* begin; - const codepoint_range* end; - }; - - static const codepoint_type_ranges ct_ranges[] = - { - { - codepoint_types::control, - cn_rs, - cn_rs + sizeof (cn_rs) / sizeof (*cn_rs) - }, - { - codepoint_types::format, - fr_rs, - fr_rs + sizeof (fr_rs) / sizeof (*fr_rs) - }, - { - codepoint_types::private_use, - pr_rs, - pr_rs + sizeof (pr_rs) / sizeof (*pr_rs) - }, - { - codepoint_types::non_character, - nc_rs, - nc_rs + sizeof (nc_rs) / sizeof (*nc_rs) - }, - { - codepoint_types::reserved, - rs_rs, - rs_rs + sizeof (rs_rs) / sizeof (*rs_rs) - } - }; - - bool - utf8 (const string& s, codepoint_types ts, const char32_t* wl) - { - // A UCS-4 character is encoded as the UTF-8 byte sequence as follows, - // depending on the value range it falls into: - // - // 0x00000000 - 0x0000007F: - // 0xxxxxxx - // - // 0x00000080 - 0x000007FF: - // 110xxxxx 10xxxxxx - // - // 0x00000800 - 0x0000FFFF: - // 1110xxxx 10xxxxxx 10xxxxxx - // - // 0x00010000 - 0x001FFFFF: - // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx - // - // 0x00200000 - 0x03FFFFFF: - // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - // - // 
0x04000000 - 0x7FFFFFFF: - // 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx - // - // Also note that the Unicode Standard (as of 12.1) specifies no - // codepoints above 0x10FFFF, so we consider 5- and 6-byte length UTF-8 - // sequences as invalid (we could have added `unspecified` codepoint type - // except that there are no UTF-8 validation tables defined for these - // sequences). - // - size_t n (s.size ()); - - for (size_t i (0); i != n; ) - { - // Detect the UTF-8 byte sequence length based on its first byte. While - // at it, start calculating the Unicode codepoint value. - // - size_t sn; - char32_t c; - unsigned char b1 (s[i]); - - if (b1 < 0x80) - { - sn = 1; - c = b1; - } - else if (b1 < 0xE0) - { - sn = 2; - c = b1 & 0x1F; // Takes 5 rightmost bits of the first sequence byte. - } - else if (b1 < 0xF0) - { - sn = 3; - c = b1 & 0xF; // Takes 4 rightmost bits of the first sequence byte. - } - else if (b1 < 0xF8) - { - sn = 4; - c = b1 & 0x7; // Takes 3 rightmost bits of the first sequence byte. - } - else - return false; // The byte starts 5- or 6-byte length sequence. - - // Bail out if the string doesn't contain all the requred codepoint - // encoding bytes. - // - if (sn > n - i) - return false; - - // Note that while a codepoint may potentially be encoded with byte - // sequences of different lengths, only the shortest encoding sequence - // is considered well-formed. Also a well-formed sequence may not be - // decoded into a UTF-16 surrogate value ([D800 DFFF]) or a value that - // is greater than the max codepoint value (0x10FFFF). We will check all - // that using the Well-Formed UTF-8 Byte Sequences table (provided by - // the Unicode 12.0 Standard) which also takes care of the missing UTF-8 - // sequence bytes. - // - // Return true if a byte value belongs to the specified range. 
- // - auto belongs = [] (unsigned char c, unsigned char l, unsigned char r) - { - return c >= l && c <= r; - }; - - switch (sn) - { - case 1: break; // Always well-formed by the definition (see above). - case 2: - { - // [000080 0007FF]: [C2 DF] [80 BF] - // - // Check the first/second bytes combinations: - // - if (!(belongs (b1, 0xC2, 0xDF) && belongs (s[i + 1], 0x80, 0xBF))) - return false; - - break; - } - case 3: - { - // [000800 000FFF]: E0 [A0 BF] [80 BF] - // [001000 00CFFF]: [E1 EC] [80 BF] [80 BF] - // [00D000 00D7FF]: ED [80 9F] [80 BF] ; Excludes surrogates. - // [00E000 00FFFF]: [EE EF] [80 BF] [80 BF] - // - unsigned char b2 (s[i + 1]); - - if (!((b1 == 0xE0 && belongs (b2, 0xA0, 0xBF)) || - (belongs (b1, 0xE1, 0xEC) && belongs (b2, 0x80, 0xBF)) || - (b1 == 0xED && belongs (b2, 0x80, 0x9F)) || - (belongs (b1, 0xEE, 0xEF) && belongs (b2, 0x80, 0xBF))) || - !belongs (s[i + 2], 0x80, 0xBF)) - return false; - - break; - } - case 4: - { - // [010000 03FFFF]: F0 [90 BF] [80 BF] [80 BF] - // [040000 0FFFFF]: [F1 F3] [80 BF] [80 BF] [80 BF] - // [100000 10FFFF]: F4 [80 8F] [80 BF] [80 BF] - // - unsigned char b2 (s[i + 1]); - - if (!((b1 == 0xF0 && belongs (b2, 0x90, 0xBF)) || - (belongs (b1, 0xF1, 0xF3) && belongs (b2, 0x80, 0xBF)) || - (b1 == 0xF4 && belongs (b2, 0x80, 0x8F))) || - !belongs (s[i + 2], 0x80, 0xBF) || - !belongs (s[i + 3], 0x80, 0xBF)) - return false; - - break; - } - } - - // For the remaining sequence bytes, "append" their 6 rightmost bits to - // the resulting codepoint value. - // - --sn; - ++i; - - for (size_t n (i + sn); i != n; ++i) - c = (c << 6) | (s[i] & 0x3F); - - // Check the decoded codepoint, unless any codepoint type is allowed. - // - if (ts == codepoint_types::any) - continue; - - using traits = u32string::traits_type; - - // Check if the decoded codepoint is whitelisted. 
- // - if (wl != nullptr && - traits::find (wl, traits::length (wl), c) != nullptr) - continue; - - // Match the decoded codepoint type against the specified type set. - // - // Detect the codepoint type (see the Types of Code Points table in the - // Unicode 12.0 Standard for details). - // - codepoint_types ct; - - // Optimize for the common case (printable ASCII characters). - // - if (c >= 0x20 && c <= 0x7E) - ct = codepoint_types::graphic; - else if ((c & 0xFFFF) >= 0xFFFE) // Non-range based detection. - ct = codepoint_types::non_character; - else - { - // Note that we consider a codepoint to be of the graphic type if it - // is not of any other type (see above). - // - ct = codepoint_types::graphic; - - // Note that the codepoint type range lists may overlap. Thus, we - // iterate over all of them until there is a match. - // - for (size_t i (0); i != sizeof (ct_ranges) / sizeof (*ct_ranges); ++i) - { - const codepoint_type_ranges& rs (ct_ranges[i]); - - // Find the range that either contains the codepoint or lays to the - // right of it. Note that here we assume a range to be less than a - // codepoint if it lays to the left of the codepoint. - // - const codepoint_range* r ( - lower_bound (rs.begin, rs.end, - c, - [] (const codepoint_range& r, char32_t c) - { - return r.second < c; - })); - - if (r != rs.end && r->first <= c) // Contains the codepoint? - { - ct = rs.type; - break; - } - } - } - - // Now check if the codepoint type matches the specified set. Note: also - // covers the `ts == codepoint_types::none` case. 
- // - if ((ct & ts) == codepoint_types::none) - return false; - } - - return true; - } -} diff --git a/libbutl/utf8.ixx b/libbutl/utf8.ixx new file mode 100644 index 0000000..3d2e092 --- /dev/null +++ b/libbutl/utf8.ixx @@ -0,0 +1,305 @@ +// file : libbutl/utf8.ixx -*- C++ -*- +// license : MIT; see accompanying LICENSE file + +namespace butl +{ + inline utf8_validator:: + utf8_validator (codepoint_types ts, const char32_t* wl) + : types_ (ts), + whitelist_ (wl) + { + } + + inline std::pair utf8_validator:: + validate (char c) + { + return validate (c, nullptr /* what */); + } + + inline std::pair utf8_validator:: + validate (char c, std::string& what) + { + return validate (c, &what); + } + + inline std::pair utf8_validator:: + validate (char c, std::string* what) + { + using namespace std; + + // A UCS-4 character is encoded as the UTF-8 byte sequence as follows, + // depending on the value range it falls into: + // + // 0x00000000 - 0x0000007F: + // 0xxxxxxx + // + // 0x00000080 - 0x000007FF: + // 110xxxxx 10xxxxxx + // + // 0x00000800 - 0x0000FFFF: + // 1110xxxx 10xxxxxx 10xxxxxx + // + // 0x00010000 - 0x001FFFFF: + // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + // + // 0x00200000 - 0x03FFFFFF: + // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + // + // 0x04000000 - 0x7FFFFFFF: + // 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + // + // Also note that the Unicode Standard (as of 12.1) specifies no + // codepoints above 0x10FFFF, so we consider 5- and 6-byte length UTF-8 + // sequences as invalid (we could have added `unspecified` codepoint type + // except that there are no UTF-8 validation tables defined for these + // sequences). + // + unsigned char b (c); + + // Compose the detailed "invalid UTF-8 sequence byte" error. 
+ // + auto byte_error = [c, b, this] () + { + string s ("invalid UTF-8 sequence "); + + const char* names[] = {"first", "second", "third", "forth"}; + s += names[seq_index_]; + s += " byte (0x"; + + const char digits[] = "0123456789ABCDEF"; + s += digits[(b >> 4) & 0xF]; + s += digits[b & 0xF]; + + // If the byte happens to be a printable ASCII character then let's + // print it as a character as well. This can help a bit with grepping + // through text while troubleshooting. + // + if (b >= 0x20 && b <= 0x7E) + { + s += " '"; + s += c; + s += "'"; + } + + s += ")"; + return s; + }; + + // Detect the byte sequence length based on its first byte. While at it, + // start calculating the resulting Unicode codepoint value. + // + if (seq_index_ == 0) + { + if (b < 0x80) + { + seq_size_ = 1; + codepoint_ = b; + } + else if (b < 0xE0) + { + seq_size_ = 2; + codepoint_ = b & 0x1F; // Takes 5 rightmost bits. + } + else if (b < 0xF0) + { + seq_size_ = 3; + codepoint_ = b & 0xF; // Takes 4 rightmost bits. + } + else if (b < 0xF8) + { + seq_size_ = 4; + codepoint_ = b & 0x7; // Takes 3 rightmost bits. + } + else + { + if (what != nullptr) + { + if (b < 0xFE) + { + *what = b < 0xFC ? "5" : "6"; + *what += "-byte length UTF-8 sequence"; + } + else + *what = byte_error (); + } + + return make_pair (false, false); // Invalid byte. + } + } + + // Note that while a codepoint may potentially be encoded with byte + // sequences of different lengths, only the shortest encoding sequence is + // considered well-formed. Also a well-formed sequence may not be decoded + // into invalid codepoint value (see codepoint_type() for details). We + // will check all that using the Well-Formed UTF-8 Byte Sequences table + // (provided by the Unicode 12.0 Standard) which also takes care of the + // missing UTF-8 sequence bytes. + // + bool valid (false); + + // Return true if a byte value belongs to the specified range. 
+ // + auto belongs = [] (unsigned char c, unsigned char l, unsigned char r) + { + return c >= l && c <= r; + }; + + switch (seq_size_) + { + case 1: valid = true; break; // Well-formed by the definition (see above). + case 2: + { + // [000080 0007FF]: [C2 DF] [80 BF] + // + // Check the first byte and set the second byte range. + // + if (seq_index_ == 0) + { + if ((valid = belongs (b, 0xC2, 0xDF))) + byte2_range_ = make_pair (0x80, 0xBF); + } + else // Check the second byte. + valid = belongs (b, byte2_range_.first, byte2_range_.second); + + break; + } + case 3: + { + // [000800 000FFF]: E0 [A0 BF] [80 BF] + // [001000 00CFFF]: [E1 EC] [80 BF] [80 BF] + // [00D000 00D7FF]: ED [80 9F] [80 BF] ; Excludes surrogates. + // [00E000 00FFFF]: [EE EF] [80 BF] [80 BF] + // + // Check the first byte and set the second byte range. + // + if (seq_index_ == 0) + { + if ((valid = (b == 0xE0))) + byte2_range_ = make_pair (0xA0, 0xBF); + else if ((valid = belongs (b, 0xE1, 0xEC))) + byte2_range_ = make_pair (0x80, 0xBF); + else if ((valid = (b == 0xED))) + byte2_range_ = make_pair (0x80, 0x9F); + else if ((valid = belongs (b, 0xEE, 0xEF))) + byte2_range_ = make_pair (0x80, 0xBF); + } + else if (seq_index_ == 1) // Check the second byte. + valid = belongs (b, byte2_range_.first, byte2_range_.second); + else // Check the third byte. + valid = belongs (b, 0x80, 0xBF); + + break; + } + case 4: + { + // [010000 03FFFF]: F0 [90 BF] [80 BF] [80 BF] + // [040000 0FFFFF]: [F1 F3] [80 BF] [80 BF] [80 BF] + // [100000 10FFFF]: F4 [80 8F] [80 BF] [80 BF] + // + // Check the first byte and set the second byte range. + // + if (seq_index_ == 0) + { + if ((valid = (b == 0xF0))) + byte2_range_ = make_pair (0x90, 0xBF); + else if ((valid = belongs (b, 0xF1, 0xF3))) + byte2_range_ = make_pair (0x80, 0xBF); + else if ((valid = (b == 0xF4))) + byte2_range_ = make_pair (0x80, 0x8F); + } + else if (seq_index_ == 1) // Check the second byte. 
+ valid = belongs (b, byte2_range_.first, byte2_range_.second); + else // Check the third and forth bytes. + valid = belongs (b, 0x80, 0xBF); + + break; + } + } + + // Bail out if the current UTF-8 sequence byte is invalid. + // + if (!valid) + { + // We could probably distinguish "surrogate" and "exceed max value" from + // other ill-formedness cases (amend the well-formedness table, keep + // decoding the sequence, and test the codepoint in the end) and produce + // more specific error messages, but this doesn't seem worth the + // trouble. + // + if (what != nullptr) + *what = byte_error (); + + return make_pair (false, false); // Invalid byte. + } + + // "Append" the sequence byte's 6 rightmost bits to the resulting + // codepoint value, unless this is the first byte (which value is already + // taken into account; see above). + // + if (seq_index_ != 0) + codepoint_ = (codepoint_ << 6) | (b & 0x3F); + + // If we didn't get to the end of the UTF-8 sequence, then we are done + // with this byte. + // + if (++seq_index_ != seq_size_) + return make_pair (true, false); // Valid byte. + + // Prepare for the next UTF-8 sequence validation, regardless of the + // decoded codepoint validity. + // + seq_index_ = 0; + + // Check the decoded codepoint, unless any codepoint type is allowed. + // + // Note that the well-formedness sequence check guarantees that we decoded + // a valid Unicode codepoint (see above). + // + if (types_ == codepoint_types::any) + return make_pair (true, true); // Valid codepoint. + + // Check if the decoded codepoint is whitelisted. + // + using traits = u32string::traits_type; + + if (whitelist_ != nullptr && + traits::find (whitelist_, traits::length (whitelist_), codepoint_) != + nullptr) + return make_pair (true, true); // Valid codepoint. + + // Now check if the codepoint type matches the specified set. Note: also + // covers the `types_ == codepoint_types::none` case. 
+ // + codepoint_types t (codepoint_type (codepoint_)); + + if ((t & types_) != codepoint_types::none) + return make_pair (true, true); // Valid codepoint. + + if (what != nullptr) + *what = "invalid Unicode codepoint (" + to_string (t) + ")"; + + return make_pair (false, true); // Invalid codepoint. + } + + inline std::pair utf8_validator:: + recover (char c) + { + // We are recovered if the character can be interpreted as a sequence + // leading byte. + // + // As an optimization, bail out if the byte is a sequence trailing byte + // (10xxxxxx). + // + if ((c & 0xC0) == 0x80) + return std::make_pair (false, false); // Invalid byte. + + seq_index_ = 0; + return validate (c); + } + + inline char32_t utf8_validator:: + codepoint () const + { + return codepoint_; + } +} diff --git a/libbutl/utf8.mxx b/libbutl/utf8.mxx new file mode 100644 index 0000000..15e8ded --- /dev/null +++ b/libbutl/utf8.mxx @@ -0,0 +1,130 @@ +// file : libbutl/utf8.mxx -*- C++ -*- +// license : MIT; see accompanying LICENSE file + +#ifndef __cpp_modules_ts +#pragma once +#endif + +// C includes. + +#ifndef __cpp_lib_modules_ts +#include +#include // uint8_t +#include // pair +#endif + +// Other includes. + +#ifdef __cpp_modules_ts +export module butl.utf8; +#ifdef __cpp_lib_modules_ts +import std.core; +#endif +import butl.unicode; +#else +#include +#endif + +#include + +LIBBUTL_MODEXPORT namespace butl +{ + // Here and below we will refer to bytes that encode a single Unicode + // codepoint as "UTF-8 byte sequence" ("UTF-8 sequence" or "byte sequence" + // for short) and a sequence of such sequences as "UTF-8 encoded byte + // string" ("byte string" for short). + // + + // Validate a UTF-8 encoded byte string one byte at a time. Optionally, also + // validate that its decoded codepoints belong to the specified types or + // codepoint whitelist. + // + class utf8_validator + { + public: + // Note: use whitelist via shallow copy. 
+ // + explicit + utf8_validator (codepoint_types = codepoint_types::any, + const char32_t* whitelist = nullptr); + + // Validate the next byte returning true if it is valid (first) and + // whether it is the last byte of a codepoint (second). The {false, true} + // result indicates a byte sequence decoded into a codepoint of undesired + // type rather than an invalid byte that happens to be the last in the + // sequence (and may well be a valid starting byte of the next sequence). + // + // Note that in case the byte is invalid, calling this function again + // without recovery is illegal. + // + std::pair + validate (char); + + // As above but in case of an invalid byte also return the description of + // why it is invalid. + // + // Note that the description only contains the reason why the specified + // byte is not part of a valid UTF-8 sequence or the desired codepoint + // type, for example: + // + // "invalid UTF-8 sequence first byte (0xB0)" + // "invalid Unicode codepoint (reserved)" + // + // It can be used to form complete diagnostics along these lines: + // + // cerr << "invalid manifest value " << name << ": " << what << endl; + // + std::pair + validate (char, std::string& what); + + // As above but decide whether the description is needed at runtime (what + // may be NULL). + // + std::pair + validate (char, std::string* what); + + // Recover from an invalid byte. + // + // This function must be called with the first invalid and then subsequent + // bytes until it signals that the specified byte is valid. Note that it + // shall not be called if the sequence is decoded into a codepoint of an + // undesired type. + // + // Note also that a byte being invalid in the middle of a UTF-8 sequence + // may be valid as a first byte of the next sequence. + // + std::pair + recover (char); + + // Return the codepoint of the last byte sequence. 
+ // + // This function can only be legally called after validate() or recover() + // signal that the preceding byte is valid and last. + // + char32_t + codepoint () const; + + private: + codepoint_types types_; + const char32_t* whitelist_; + + // State machine. + // + uint8_t seq_size_; // [1 4]; calculated at the first byte validation. + uint8_t seq_index_ = 0; // [0 3] + + // Last byte sequence decoded codepoint (built incrementally). + // + char32_t codepoint_; + + // The byte range a valid UTF-8 sequence second byte must belong to as + // calculated during the first byte validation. + // + // Note that the subsequent (third and fourth) bytes must belong to the + // [80 BF] range regardless of the previous bytes. + // + std::pair byte2_range_; + }; +} + +#include diff --git a/libbutl/utility.cxx b/libbutl/utility.cxx index ce78295..d6a21c6 100644 --- a/libbutl/utility.cxx +++ b/libbutl/utility.cxx @@ -35,6 +35,9 @@ import std.io; #endif #endif +import butl.utf8; +#else +#include #endif namespace butl @@ -191,6 +194,135 @@ namespace butl } void + to_utf8 (string& s, char repl, codepoint_types ts, const char32_t* wl) + { + using iterator = string::iterator; + + utf8_validator val (ts, wl); + + iterator i (s.begin ()); // Source current position. + iterator e (s.end ()); // Source end position. + iterator d (i); // Destination current position. + iterator b (d); // Begin of the current destination sequence. + + // Replace the current byte and prepare for the next sequence. + // + auto replace_byte = [&d, &b, repl] () + { + *d++ = repl; + b = d; + }; + + // Replace bytes of the current sequence excluding the current byte and + // prepare for the next sequence. + // + auto replace_sequence = [&d, &b, repl] () + { + for (; b != d; ++b) + *b = repl; + }; + + // Replace sequence bytes with a single replacement byte and prepare for + // the next sequence. + // + auto replace_codepoint = [&d, &b, &replace_byte] () + { + d = b; // Rewind to the beginning of the sequence. 
+ replace_byte (); + }; + + // Iterate over the byte string appending valid bytes, replacing invalid + // bytes/codepoints, and recovering after invalid bytes. + // + for (; i != e; ++i) + { + char c (*i); + pair v (val.validate (c)); + + // Append a valid byte and prepare for the next sequence if the sequence + // end is reached. + // + auto append_byte = [&d, &b, &v, &c] () + { + *d++ = c; + + if (v.second) // Sequence last byte? + b = d; + }; + + // If this is a valid byte/codepoint, then append the byte and proceed + // to the next string byte. + // + if (v.first) + { + append_byte (); + continue; + } + + // If this is an invalid codepoint, then replace the sequence with a + // single replacement character and proceed to the next byte sequence + // (no recovery is necessary). + // + if (v.second) + { + replace_codepoint (); + continue; + } + + // Now, given this is an invalid byte, replace the current sequence + // bytes and recover. + // + replace_sequence (); + + // Stay in the recovery cycle until a valid byte is encountered. Note + // that we start from where we left off, not from the next byte (see + // utf8_validator::recover() for details). + // + for (; i != e; ++i) + { + c = *i; + v = val.recover (c); + + // End the recovery cycle for a valid byte. + // + if (v.first) + { + append_byte (); + break; + } + + // End the recovery cycle for a decoded but invalid (ASCII-range) + // codepoint. + // + if (v.second) + { + replace_codepoint (); + break; + } + + replace_byte (); + } + + // Bail out if we reached the end of the byte string. Note that while we + // failed to recover (otherwise i != e), all the bytes are already + // replaced. + // + if (i == e) + break; + } + + // If the last byte sequence is incomplete, then replace its bytes. + // + if (b != d) + replace_sequence (); + + // Shrink the byte string if we replaced any invalid codepoints. 
+ // + if (d != e) + s.resize (d - s.begin ()); + } + + void setenv (const string& name, const string& value) { #ifndef _WIN32 diff --git a/libbutl/utility.ixx b/libbutl/utility.ixx index c5fdbac..27ef7fb 100644 --- a/libbutl/utility.ixx +++ b/libbutl/utility.ixx @@ -2,8 +2,11 @@ // license : MIT; see accompanying LICENSE file #ifndef __cpp_lib_modules_ts -#include // getenv() -#include +#include // toupper(), tolower(), is*() +#include // isw*() +#include // getenv() +#include // for_each() +#include // invalid_argument #endif namespace butl @@ -216,44 +219,84 @@ namespace butl return sanitize_identifier (std::string (s)); } - inline codepoint_types - operator&= (codepoint_types& x, codepoint_types y) + inline bool + eof (std::istream& is) { - return x = static_cast ( - static_cast (x) & - static_cast (y)); + if (!is.fail ()) + return false; + + if (is.eof ()) + return true; + + throw std::istream::failure (""); } - inline codepoint_types - operator|= (codepoint_types& x, codepoint_types y) + inline optional + utf8_length_impl (const std::string& s, + std::string* what, + codepoint_types ts, + const char32_t* wl) { - return x = static_cast ( - static_cast (x) | - static_cast (y)); + using namespace std; + + // Optimize for an empty string. + // + if (s.empty ()) + return 0; + + size_t r (0); + pair v; + utf8_validator val (ts, wl); + + for (char c: s) + { + v = val.validate (c, what); + + if (!v.first) // Invalid byte? + return nullopt; + + if (v.second) // Last byte in the sequence? + ++r; + } + + // Make sure that the last UTF-8 sequence is complete. 
+ // + if (!v.second) + { + if (what != nullptr) + *what = "incomplete UTF-8 sequence"; + + return nullopt; + } + + return r; } - inline codepoint_types - operator& (codepoint_types x, codepoint_types y) + inline std::size_t + utf8_length (const std::string& s, codepoint_types ts, const char32_t* wl) { - return x &= y; + using namespace std; + + string what; + if (optional r = utf8_length_impl (s, &what, ts, wl)) + return *r; + + throw invalid_argument (what); } - inline codepoint_types - operator| (codepoint_types x, codepoint_types y) + inline bool + utf8 (const std::string& s, + std::string& what, + codepoint_types ts, + const char32_t* wl) { - return x |= y; + return utf8_length_impl (s, &what, ts, wl).has_value (); } inline bool - eof (std::istream& is) + utf8 (const std::string& s, codepoint_types ts, const char32_t* wl) { - if (!is.fail ()) - return false; - - if (is.eof ()) - return true; - - throw std::istream::failure (""); + return utf8_length_impl (s, nullptr, ts, wl).has_value (); } inline optional diff --git a/libbutl/utility.mxx b/libbutl/utility.mxx index 71c2860..b84e731 100644 --- a/libbutl/utility.mxx +++ b/libbutl/utility.mxx @@ -16,13 +16,10 @@ #include // ostream #include #include // size_t -#include // move(), forward() +#include // move(), forward(), pair #include // strcmp(), strlen() #include // exception, uncaught_exception[s]() //#include // hash - -#include // toupper(), tolower(), is*() -#include // isw*() #endif #include // thread_local @@ -34,8 +31,12 @@ export module butl.utility; import std.core; import std.io; #endif +import butl.utf8; +import butl.unicode; import butl.optional; #else +#include +#include #include #endif @@ -194,41 +195,44 @@ LIBBUTL_MODEXPORT namespace butl std::string sanitize_identifier (std::string&&); std::string sanitize_identifier (const std::string&); - // Return true if the string is a valid UTF-8 encoded byte sequence and, - // optionally, its decoded codepoints belong to the specified types or to - // 
the codepoint whitelist. - // - // Note that the Unicode Standard considers a UTF-8 byte sequence decoded - // into a codepoint of the surrogate type as invalid. Thus, the surrogate - // type may not be specified. + // Return true if the string is a valid UTF-8 encoded byte string and, + // optionally, its decoded codepoints belong to the specified types or + // codepoint whitelist. // - enum class codepoint_types: std::uint16_t - { - // Useful to only allow the whitelisted codepoints or when building the - // type set incrementally. - // - none = 0x00, - - graphic = 0x01, // L(etter), M(ark), N(number), P(uncturation), - // S(symbol), Zs(separator, space) - format = 0x02, - control = 0x04, - private_use = 0x08, - non_character = 0x10, - reserved = 0x20, - - any = 0x3f - }; + bool + utf8 (const std::string&, + codepoint_types = codepoint_types::any, + const char32_t* whitelist = nullptr); - LIBBUTL_SYMEXPORT bool + // As above but in case of an invalid sequence also return the description + // of why it is invalid. + // + bool utf8 (const std::string&, + std::string& what, codepoint_types = codepoint_types::any, const char32_t* whitelist = nullptr); - codepoint_types operator& (codepoint_types, codepoint_types); - codepoint_types operator| (codepoint_types, codepoint_types); - codepoint_types operator&= (codepoint_types&, codepoint_types); - codepoint_types operator|= (codepoint_types&, codepoint_types); + // Return UTF-8 byte string length in codepoints. Throw + // std::invalid_argument if this is not a valid UTF-8. + // + std::size_t + utf8_length (const std::string&, + codepoint_types = codepoint_types::any, + const char32_t* whitelist = nullptr); + + // Fixup the specified string (in place) to be valid UTF-8 replacing invalid + // bytes and codepoints with the specified character, for example, '?'. 
+ // + // Potential future improvements: + // - char32_t replacement (will need UTF-8 encoding) + // - different replacement for bytes and codepoints + // + LIBBUTL_SYMEXPORT void + to_utf8 (std::string&, + char replacement, + codepoint_types = codepoint_types::any, + const char32_t* whitelist = nullptr); // If an input stream is in a failed state, then return true if this is // because of the eof and throw istream::failure otherwise. If the stream diff --git a/tests/manifest-parser/driver.cxx b/tests/manifest-parser/driver.cxx index 57674cb..a34f2b7 100644 --- a/tests/manifest-parser/driver.cxx +++ b/tests/manifest-parser/driver.cxx @@ -40,6 +40,9 @@ namespace butl static bool equal (const optional& actual, const optional& expected); + static pairs + parse (const char* m, manifest_parser::filter_function f = {}); + // Test manifest as it is represented in the stream, including format // version and end-of-manifest values. // @@ -188,6 +191,41 @@ namespace butl assert (p.first == "" && p.second == "comment"); } + // UTF-8. + // + assert (test (":1\n#\xD0\xB0\n\xD0\xB0y\xD0\xB0:\xD0\xB0z\xD0\xB0", + {{"","1"}, + {"\xD0\xB0y\xD0\xB0", "\xD0\xB0z\xD0\xB0"}, + {"",""}, + {"",""}})); + + assert (fail (":1\n#\xD0\n\xD0\xB0y\xD0\xB0:\xD0\xB0z\xD0\xB0")); + assert (fail (":1\n#\xD0\xB0\n\xB0y\xD0\xB0:\xD0\xB0z\xD0\xB0")); + assert (fail (":1\n#\xD0\xB0\n\xD0y\xD0\xB0:\xD0\xB0z\xD0\xB0")); + assert (fail (":1\n#\xD0\xB0\n\xD0\xB0y\xD0:\xD0\xB0z\xD0\xB0")); + assert (fail (":1\n#\xD0\xB0\n\xD0\xB0y\xD0\xB0:\xD0z\xD0\xB0")); + assert (fail (":1\n#\xD0\xB0\n\xD0\xB0y\xD0\xB0:\xD0\xB0z\xD0")); + assert (fail (":1\r\r\xB0#\n\xD0\xB0y\xD0\xB0:\xD0\xB0z\xD0\xB0")); + assert (fail (":1\r\xD0#\n\xD0\xB0y\xD0\xB0:\xD0\xB0z\xD0\xB0")); + assert (fail (":1\n#\xD0\xB0\n\xD0\xB0y\xD0\xB0:\xD0\xB0z\xD0\xB0\r\xD0")); + + // Test parsing failure for manifest with multi-byte UTF-8 sequences + // (the column is properly reported, etc). 
+ // + try + { + parse (":1\na\xD0\xB0\xD0\xB0\xFE"); + assert (false); + } + catch (const manifest_parsing& e) + { + assert (e.line == 2 && + e.column == 4 && + e.description == + "invalid manifest name: " + "invalid UTF-8 sequence first byte (0xFE)"); + } + // Filtering. // assert (test (":1\na: abc\nb: bca\nc: cab", @@ -281,7 +319,7 @@ namespace butl } static pairs - parse (const char* m, manifest_parser::filter_function f = {}) + parse (const char* m, manifest_parser::filter_function f) { istringstream is (m); is.exceptions (istream::failbit | istream::badbit); diff --git a/tests/manifest-rewriter/driver.cxx b/tests/manifest-rewriter/driver.cxx index fd76929..ec73d81 100644 --- a/tests/manifest-rewriter/driver.cxx +++ b/tests/manifest-rewriter/driver.cxx @@ -90,6 +90,10 @@ namespace butl {{"abc", "xyz"}}) == ":1\n abc: \\\nxyz\n\\"); + assert (edit (":1\n a\xD0\xB0g : b", + {{"a\xD0\xB0g", "xyz"}}) == + ":1\n a\xD0\xB0g : \\\nxyz\n\\"); + // Test editing of manifests that contains CR characters. // assert (edit (":1\r\na: b\r\r\n", {{"a", "xyz"}}) == ":1\r\na: xyz\r\r\n"); diff --git a/tests/manifest-serializer/driver.cxx b/tests/manifest-serializer/driver.cxx index 148a281..c818b4a 100644 --- a/tests/manifest-serializer/driver.cxx +++ b/tests/manifest-serializer/driver.cxx @@ -46,6 +46,7 @@ main () assert (test ({{"#", "x"}}, "# x\n")); assert (test ({{"#", "x"},{"#", "y"},{"#", ""}}, "# x\n# y\n#\n")); assert (fail ({{"",""},{"#", "x"}})); // serialization after eos + assert (fail ({{"#", "\xB0"}})); // invalid UTF-8 sequence // Empty manifest stream. // @@ -89,6 +90,12 @@ main () assert (fail ({{"","1"},{"a b",""}})); assert (fail ({{"","1"},{"a\tb",""}})); assert (fail ({{"","1"},{"a\n",""}})); + assert (fail ({{"","1"},{"a\xB0",""}})); // invalid UTF-8 sequence + + // Invalid value. + // + assert (fail ({{"","1"},{"a","\xB0"}})); // invalid UTF-8 sequence + assert (fail ({{"","1"},{"a","\xD0"}})); // incomplete UTF-8 sequence // Simple value. 
// @@ -172,11 +179,22 @@ main () "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\\Y\\\n" "yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy"); + // Hard break after the UTF-8/delayed hard break. + // + string l6 ("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" + "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\xF0\x90\x8C\x82" + "\xF0\x90\x8C\x82yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy"); + + string e6 ("xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx" + "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\xF0\x90\x8C\x82\\\n" + "\xF0\x90\x8C\x82yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy"); + assert (test ({{"","1"},{"a",l1},{"",""},{"",""}}, ": 1\na: " + e1 + "\n")); assert (test ({{"","1"},{"a",l2},{"",""},{"",""}}, ": 1\na: " + e2 + "\n")); assert (test ({{"","1"},{"a",l3},{"",""},{"",""}}, ": 1\na: " + e3 + "\n")); assert (test ({{"","1"},{"a",l4},{"",""},{"",""}}, ": 1\na: " + e4 + "\n")); assert (test ({{"","1"},{"a",l5},{"",""},{"",""}}, ": 1\na: " + e5 + "\n")); + assert (test ({{"","1"},{"a",l6},{"",""},{"",""}}, ": 1\na: " + e6 + "\n")); // Multi-line value. // diff --git a/tests/utf8/driver.cxx b/tests/utf8/driver.cxx index 8480dec..f35e65e 100644 --- a/tests/utf8/driver.cxx +++ b/tests/utf8/driver.cxx @@ -13,8 +13,10 @@ #ifdef __cpp_lib_modules_ts import std.core; #endif +import butl.utf8; import butl.utility; #else +#include #include #endif @@ -24,6 +26,17 @@ using namespace butl; int main () { + // utf8() tests. + // + auto utf8_error = [] (const string& s, + codepoint_types ts = codepoint_types::any, + const char32_t* wl = nullptr) + { + string error; + assert (!utf8 (s, error, ts, wl)); + return error; + }; + // Valid sequences. // // Empty. @@ -43,18 +56,36 @@ main () // Ill-formed sequences. // + // Long sequences. + // + assert (!utf8 ("\xF8")); // 5-byte sequence. + assert (!utf8 ("\xFC")); // 6-byte sequence. 
+ + assert (utf8_error ("\xF8") == "5-byte length UTF-8 sequence"); + assert (utf8_error ("\xFC") == "6-byte length UTF-8 sequence"); + assert (utf8_error ("\xFE") == "invalid UTF-8 sequence first byte (0xFE)"); + // 2-byte sequences. // assert (!utf8 ("\xC1\x80")); // Invalid first byte. assert (!utf8 ("\xD0y")); // Invalid second byte. + assert (utf8_error ("\xC1\x80") == + "invalid UTF-8 sequence first byte (0xC1)"); + + assert (utf8_error ("\xD0y") == + "invalid UTF-8 sequence second byte (0x79 'y')"); + // 3-byte sequences. // assert (!utf8 ("\xE2\x70\x80")); // Invalid second byte. assert (!utf8 ("\xE2\x80\x70")); // Invalid third byte. - assert (!utf8 ("\xED\xA0\x80")); // Min UTF-16 surrogate value. - assert (!utf8 ("\xED\xBF\xBF")); // Max UTF-16 surrogate value. + assert (!utf8 ("\xED\xA0\x80")); // Min UTF-16 surrogate. + assert (!utf8 ("\xED\xBF\xBF")); // Max UTF-16 surrogate. + + assert (utf8_error ("\xE2\x80\x70") == + "invalid UTF-8 sequence third byte (0x70 'p')"); // 4-byte sequences. // @@ -63,9 +94,8 @@ main () assert (!utf8 ("\xF0\x90\x70\x80")); // Invalid third byte. assert (!utf8 ("\xF1\x80\x80\xC0")); // Invalid forth byte. - // Out of the codepoint range (0x10ffff + 1). - // - assert (!utf8 ("\xF4\x90\x80\x80")); + assert (utf8_error ("\xF1\x80\x80\xC0") == + "invalid UTF-8 sequence forth byte (0xC0)"); // Incomplete sequences. // @@ -73,14 +103,25 @@ main () assert (!utf8 ("\xE4\xBA")); // 3-byte sequence. assert (!utf8 ("\xF0\x90\x8C")); // 4-byte sequence. + assert (utf8_error ("\xD0") == "incomplete UTF-8 sequence"); + // Missing sequence leading bytes. // - assert (!utf8 ("\xB0xyz")); // 2-byte sequence. - assert (!utf8 ("\xBA\x8Cxyz")); // 3-byte sequence. - assert (!utf8 ("\x8Cxyz")); // 3-byte sequence. - assert (!utf8 ("\x90\x8C\x82xyz")); // 4-byte sequence. - assert (!utf8 ("\x8C\x82xyz")); // 4-byte sequence. - assert (!utf8 ("\x82xyz")); // 4-byte sequence. + assert (!utf8 ("\xB0xyz")); // 2-byte sequence. 
+ assert (!utf8 ("\xBA\x8C\xD0\xB0yz")); // 3-byte sequence. + assert (!utf8 ("\x8Cxyz")); // 3-byte sequence. + assert (!utf8 ("\x90\x8C\x82xyz")); // 4-byte sequence. + assert (!utf8 ("\x8C\x82xyz")); // 4-byte sequence. + assert (!utf8 ("\x82xyz")); // 4-byte sequence. + + assert (utf8_error ("\xB0") == "invalid UTF-8 sequence first byte (0xB0)"); + + // Above the valid codepoint range (0x10ffff + 1). + // + assert (!utf8 ("\xF4\x90\x80\x80")); + + assert (utf8_error ("\xF4\x90\x80\x80") == + "invalid UTF-8 sequence second byte (0x90)"); // Whitelisting. // @@ -145,6 +186,9 @@ main () assert (!utf8 ("\xF3\xA1\x80\x80", codepoint_types::graphic)); // Reserved. assert (!utf8 ("\xF3\xA0\x81\xBF", codepoint_types::graphic)); // Format. + assert (utf8_error ("\xF3\xA0\x81\xBF", codepoint_types::graphic) == + "invalid Unicode codepoint (format)"); + assert (!utf8 ("\xC2\xAC", codepoint_types::format)); // Graphic. // Private-use & Graphic. @@ -153,4 +197,145 @@ main () codepoint_types::format)); assert (!utf8 ("a", codepoint_types::none)); // None. + + assert (utf8_error ("a", codepoint_types::none) == + "invalid Unicode codepoint (graphic)"); + + // UTF-8 string length. + // + auto invalid_utf8 = [] (string s, codepoint_types ts = codepoint_types::any) + { + try + { + utf8_length (s, ts); + return false; + } + catch (const invalid_argument&) + { + return true; + } + }; + + assert (utf8_length ("") == 0); + assert (utf8_length ("x\xD0\xB0\xE4\xBA\x8C\xF0\x90\x8C\x82y") == 5); + + assert (invalid_utf8 ("\xFE")); // Invalid byte. + assert (invalid_utf8 ("\xD0")); // Incomplete. + assert (invalid_utf8 ("\n", codepoint_types::graphic)); // Invalid codepoint. + + // to_utf8() tests. + // + auto roundtrip = [] (const char* s) + { + string r (s); + to_utf8 (r, '?'); + return r == s; + }; + + auto sanitize = [] (string s, codepoint_types ts = codepoint_types::any) + { + to_utf8 (s, '?', ts); + return s; + }; + + // Empty. + // + assert (roundtrip ("")); + + // 1 code point. 
+ // + assert (roundtrip ("a")); // 1 byte. + assert (roundtrip ("\xD0\xB0")); // 2 bytes. + assert (roundtrip ("\xE4\xBA\x8C")); // 3 bytes. + assert (roundtrip ("\xF0\x90\x8C\x82")); // 4 bytes. + + // Multiple code points. + // + assert (roundtrip ("a\xD0\xB0\xE4\xBA\x8C\xF0\x90\x8C\x82")); + + // Ill-formed sequences. + // + // Long sequence. + // + assert (sanitize ("\xF8") == "?"); // 5-byte sequence. + + // Invalid first byte followed by a second byte which ... + // + assert (sanitize ("\xC1\x80") == "??"); // is a trailing byte. + assert (sanitize ("\xC1y") == "?y"); // starts 1-byte sequence. + assert (sanitize ("\xC1\xD0\xB0") == "?\xD0\xB0"); // starts 2-byte sequence. + assert (sanitize ("\xC1\xFE") == "??"); // is not UTF-8. + + // Invalid second byte which ... + // + assert (sanitize ("\xD0y") == "?y"); // starts 1-byte sequence. + assert (sanitize ("\xD0\xD0\xB0") == "?\xD0\xB0"); // starts 2-byte sequence. + assert (sanitize ("\xD0\xFE") == "??"); // is not UTF-8. + + // Incomplete sequences. + // + assert (sanitize ("\xD0") == "?"); // 2-byte sequence. + assert (sanitize ("y\xD0") == "y?"); // 2-byte sequence. + assert (sanitize ("\xE4\xBA") == "??"); // 3-byte sequence. + assert (sanitize ("\xD0\xD0") == "??"); // 2-byte sequence. + + // Incomplete recovery. + // + assert (sanitize ("\xD0\xFE") == "??"); // 2-byte sequence. + assert (sanitize ("\xD0\xFE\xFE") == "???"); // 2-byte sequence. + + assert (sanitize ("\xF4\x90\x80\x80") == "????"); // Above the range. + assert (sanitize ("\xED\xA0\x80") == "???"); // Min UTF-16 surrogate. + assert (sanitize ("\xED\xBF\xBF") == "???"); // Max UTF-16 surrogate. + + // Invalid codepoints. + // + auto sanitize_g = [&sanitize] (string s) + { + return sanitize (move (s), codepoint_types::graphic); + }; + + assert (sanitize_g ("\xEF\xB7\x90") == "?"); + assert (sanitize_g ("y\xEF\xB7\x90") == "y?"); + assert (sanitize_g ("\xEF\xB7\x90y") == "?y"); + + // Invalid during recovery. 
+ // + assert (sanitize_g ("\xD0\n") == "??"); + assert (sanitize_g ("\xD0\ny") == "??y"); + assert (sanitize_g ("\xD0\xFE\n") == "???"); + + assert (sanitize_g ("\xD0\xEF\xB7\x90") == "??"); + + // utf8_validator::codepoint() tests. + // + { + u32string r; + size_t invalid_codepoints (0); + + string s ("a" + "\xD0\xB0" + "\n" // Control. + "\xE4\xBA\x8C" + "\xEE\x80\x80" // Private-use. + "\xF0\x90\x8C\x82"); + + utf8_validator val (codepoint_types::graphic); + + for (char c: s) + { + pair v (val.validate (c)); + + if (v.first) + { + if (v.second) + r.push_back (val.codepoint ()); + } + else + ++invalid_codepoints; + } + + assert (r == U"a\x430\x4E8C\x10302"); + assert (invalid_codepoints == 2); + } } -- cgit v1.1