From acd7a29c02e222e27d474fe3b64b1ca11b3506c0 Mon Sep 17 00:00:00 2001 From: Karen Arutyunov Date: Wed, 18 Dec 2019 22:32:16 +0300 Subject: Add utf8() predicate --- libbutl/utf8.cxx | 343 ++++++++++++++++++++++++++++++++++++++++++++++++++ libbutl/utility.ixx | 28 +++++ libbutl/utility.mxx | 46 ++++++- tests/utf8/buildfile | 7 ++ tests/utf8/driver.cxx | 157 +++++++++++++++++++++++ 5 files changed, 575 insertions(+), 6 deletions(-) create mode 100644 libbutl/utf8.cxx create mode 100644 tests/utf8/buildfile create mode 100644 tests/utf8/driver.cxx diff --git a/libbutl/utf8.cxx b/libbutl/utf8.cxx new file mode 100644 index 0000000..e2a42bd --- /dev/null +++ b/libbutl/utf8.cxx @@ -0,0 +1,343 @@ +// file : libbutl/utf8.cxx -*- C++ -*- +// copyright : Copyright (c) 2014-2019 Code Synthesis Ltd +// license : MIT; see accompanying LICENSE file + +#ifndef __cpp_modules_ts +#include +#endif + +#ifndef __cpp_lib_modules_ts +#include +#include + +#include // lower_bound() +#endif + +#ifdef __cpp_modules_ts +module butl.utility; + +// Only imports additional to interface. +#ifdef __clang__ +#ifdef __cpp_lib_modules_ts +import std.core; +import std.io; +#endif +#endif + +#endif + +namespace butl +{ + using namespace std; + + // Sorted arrays of the Unicode codepoint ranges corresponding to the + // codepoint types. Note that code type range lists (but not ranges + // themselves) may overlap. + // + // Note that the graphic type codepoints are numerous and scattered. Thus, + // we will consider a codepoint to be of the graphic type if it is not of + // any other type. + // + using codepoint_range = pair; + + static const codepoint_range cn_rs[] = // Control. + { + {0x00, 0x1F}, + {0x7F, 0x9F} + }; + + static const codepoint_range fr_rs[] = // Format. 
+ { + {0x000AD, 0x000AD}, + {0x00600, 0x00605}, + {0x0061C, 0x0061C}, + {0x006DD, 0x006DD}, + {0x0070F, 0x0070F}, + {0x008E2, 0x008E2}, + {0x0180E, 0x0180E}, + {0x0200B, 0x0200F}, + {0x0202A, 0x0202E}, + {0x02060, 0x02064}, + {0x02066, 0x0206F}, + {0x0FEFF, 0x0FEFF}, + {0x0FFF9, 0x0FFFB}, + {0x110BD, 0x110BD}, + {0x110CD, 0x110CD}, + {0x13430, 0x13438}, + {0x1BCA0, 0x1BCA3}, + {0x1D173, 0x1D17A}, + {0xE0001, 0xE0001}, + {0xE0020, 0xE007F} + }; + + static const codepoint_range pr_rs[] = // Private-use. + { + {0x00E000, 0x00F8FF}, + {0x0F0000, 0x10FFFF} + }; + + static const codepoint_range nc_rs[] = // Non-character. + { + {0xFDD0, 0xFDEF} + }; + + static const codepoint_range rs_rs[] = // Reserved. + { + {0x30000, 0xE0000}, + {0xE0002, 0xE001F}, + {0xE0080, 0xE00FF}, + {0xE01F0, 0xEFFFF} + }; + + struct codepoint_type_ranges + { + codepoint_types type; + const codepoint_range* begin; + const codepoint_range* end; + }; + + static const codepoint_type_ranges ct_ranges[] = + { + { + codepoint_types::control, + cn_rs, + cn_rs + sizeof (cn_rs) / sizeof (*cn_rs) + }, + { + codepoint_types::format, + fr_rs, + fr_rs + sizeof (fr_rs) / sizeof (*fr_rs) + }, + { + codepoint_types::private_use, + pr_rs, + pr_rs + sizeof (pr_rs) / sizeof (*pr_rs) + }, + { + codepoint_types::non_character, + nc_rs, + nc_rs + sizeof (nc_rs) / sizeof (*nc_rs) + }, + { + codepoint_types::reserved, + rs_rs, + rs_rs + sizeof (rs_rs) / sizeof (*rs_rs) + } + }; + + bool + utf8 (const string& s, codepoint_types ts, const char32_t* wl) + { + // A UCS-4 character is encoded as the UTF-8 byte sequence as follows, + // depending on the value range it falls into: + // + // 0x00000000 - 0x0000007F: + // 0xxxxxxx + // + // 0x00000080 - 0x000007FF: + // 110xxxxx 10xxxxxx + // + // 0x00000800 - 0x0000FFFF: + // 1110xxxx 10xxxxxx 10xxxxxx + // + // 0x00010000 - 0x001FFFFF: + // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + // + // 0x00200000 - 0x03FFFFFF: + // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + // + // 
0x04000000 - 0x7FFFFFFF: + // 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + // + // Also note that the Unicode Standard (as of 12.1) specifies no + // codepoints above 0x10FFFF, so we consider 5- and 6-byte length UTF-8 + // sequences as invalid (we could have added `unspecified` codepoint type + // except that there are no UTF-8 validation tables defined for these + // sequences). + // + size_t n (s.size ()); + + for (size_t i (0); i != n; ) + { + // Detect the UTF-8 byte sequence length based on its first byte. While + // at it, start calculating the Unicode codepoint value. + // + size_t sn; + char32_t c; + unsigned char b1 (s[i]); + + if (b1 < 0x80) + { + sn = 1; + c = b1; + } + else if (b1 < 0xE0) + { + sn = 2; + c = b1 & 0x1F; // Takes 5 rightmost bits of the first sequence byte. + } + else if (b1 < 0xF0) + { + sn = 3; + c = b1 & 0xF; // Takes 4 rightmost bits of the first sequence byte. + } + else if (b1 < 0xF8) + { + sn = 4; + c = b1 & 0x7; // Takes 3 rightmost bits of the first sequence byte. + } + else + return false; // The byte starts 5- or 6-byte length sequence. + + // Bail out if the string doesn't contain all the required codepoint + // encoding bytes. + // + if (sn > n - i) + return false; + + // Note that while a codepoint may potentially be encoded with byte + // sequences of different lengths, only the shortest encoding sequence + // is considered well-formed. Also a well-formed sequence may not be + // decoded into a UTF-16 surrogate value ([D800 DFFF]) or a value that + // is greater than the max codepoint value (0x10FFFF). We will check all + // that using the Well-Formed UTF-8 Byte Sequences table (provided by + // the Unicode 12.0 Standard) which also takes care of the missing UTF-8 + // sequence bytes. + // + // Return true if a byte value belongs to the specified range. 
+ // + auto belongs = [] (unsigned char c, unsigned char l, unsigned char r) + { + return c >= l && c <= r; + }; + + switch (sn) + { + case 1: break; // Always well-formed by the definition (see above). + case 2: + { + // [000080 0007FF]: [C2 DF] [80 BF] + // + // Check the first/second bytes combinations: + // + if (!(belongs (b1, 0xC2, 0xDF) && belongs (s[i + 1], 0x80, 0xBF))) + return false; + + break; + } + case 3: + { + // [000800 000FFF]: E0 [A0 BF] [80 BF] + // [001000 00CFFF]: [E1 EC] [80 BF] [80 BF] + // [00D000 00D7FF]: ED [80 9F] [80 BF] ; Excludes surrogates. + // [00E000 00FFFF]: [EE EF] [80 BF] [80 BF] + // + unsigned char b2 (s[i + 1]); + + if (!((b1 == 0xE0 && belongs (b2, 0xA0, 0xBF)) || + (belongs (b1, 0xE1, 0xEC) && belongs (b2, 0x80, 0xBF)) || + (b1 == 0xED && belongs (b2, 0x80, 0x9F)) || + (belongs (b1, 0xEE, 0xEF) && belongs (b2, 0x80, 0xBF))) || + !belongs (s[i + 2], 0x80, 0xBF)) + return false; + + break; + } + case 4: + { + // [010000 03FFFF]: F0 [90 BF] [80 BF] [80 BF] + // [040000 0FFFFF]: [F1 F3] [80 BF] [80 BF] [80 BF] + // [100000 10FFFF]: F4 [80 8F] [80 BF] [80 BF] + // + unsigned char b2 (s[i + 1]); + + if (!((b1 == 0xF0 && belongs (b2, 0x90, 0xBF)) || + (belongs (b1, 0xF1, 0xF3) && belongs (b2, 0x80, 0xBF)) || + (b1 == 0xF4 && belongs (b2, 0x80, 0x8F))) || + !belongs (s[i + 2], 0x80, 0xBF) || + !belongs (s[i + 3], 0x80, 0xBF)) + return false; + + break; + } + } + + // For the remaining sequence bytes, "append" their 6 rightmost bits to + // the resulting codepoint value. + // + --sn; + ++i; + + for (size_t n (i + sn); i != n; ++i) + c = (c << 6) | (s[i] & 0x3F); + + // Check the decoded codepoint, unless any codepoint type is allowed. + // + if (ts == codepoint_types::any) + continue; + + using traits = u32string::traits_type; + + // Check if the decoded codepoint is whitelisted. 
+ // + if (wl != nullptr && + traits::find (wl, traits::length (wl), c) != nullptr) + continue; + + // Match the decoded codepoint type against the specified type set. + // + // Detect the codepoint type (see the Types of Code Points table in the + // Unicode 12.0 Standard for details). + // + codepoint_types ct; + + // Optimize for the common case (printable ASCII characters). + // + if (c >= 0x20 && c <= 0x7E) + ct = codepoint_types::graphic; + else if ((c & 0xFFFF) >= 0xFFFE) // Non-range based detection. + ct = codepoint_types::non_character; + else + { + // Note that we consider a codepoint to be of the graphic type if it + // is not of any other type (see above). + // + ct = codepoint_types::graphic; + + // Note that the codepoint type range lists may overlap. Thus, we + // iterate over all of them until there is a match. + // + for (size_t i (0); i != sizeof (ct_ranges) / sizeof (*ct_ranges); ++i) + { + const codepoint_type_ranges& rs (ct_ranges[i]); + + // Find the range that either contains the codepoint or lays to the + // right of it. Note that here we assume a range to be less than a + // codepoint if it lays to the left of the codepoint. + // + const codepoint_range* r ( + lower_bound (rs.begin, rs.end, + c, + [] (const codepoint_range& r, char32_t c) + { + return r.second < c; + })); + + if (r != rs.end && r->first <= c) // Contains the codepoint? + { + ct = rs.type; + break; + } + } + } + + // Now check if the codepoint type matches the specified set. Note: also + // covers the `ts == codepoint_types::none` case. 
+ // + if ((ct & ts) == codepoint_types::none) + return false; + } + + return true; + } +} diff --git a/libbutl/utility.ixx b/libbutl/utility.ixx index d8a5ee8..aabaef6 100644 --- a/libbutl/utility.ixx +++ b/libbutl/utility.ixx @@ -217,6 +217,34 @@ namespace butl return sanitize_identifier (std::string (s)); } + inline codepoint_types + operator&= (codepoint_types& x, codepoint_types y) + { + return x = static_cast ( + static_cast (x) & + static_cast (y)); + } + + inline codepoint_types + operator|= (codepoint_types& x, codepoint_types y) + { + return x = static_cast ( + static_cast (x) | + static_cast (y)); + } + + inline codepoint_types + operator& (codepoint_types x, codepoint_types y) + { + return x &= y; + } + + inline codepoint_types + operator| (codepoint_types x, codepoint_types y) + { + return x |= y; + } + inline bool eof (std::istream& is) { diff --git a/libbutl/utility.mxx b/libbutl/utility.mxx index 11aa013..3bb335a 100644 --- a/libbutl/utility.mxx +++ b/libbutl/utility.mxx @@ -192,8 +192,44 @@ LIBBUTL_MODEXPORT namespace butl // Note that it doesn't make sure the first character is not a digit. // std::string& sanitize_identifier (std::string&); - std::string sanitize_identifier (std::string&&); - std::string sanitize_identifier (const std::string&); + std::string sanitize_identifier (std::string&&); + std::string sanitize_identifier (const std::string&); + + // Return true if the string is a valid UTF-8 encoded byte sequence and, + // optionally, its decoded codepoints belong to the specified types or to + // the codepoint whitelist, unless it is NULL (the default). + // + // Note that the Unicode Standard considers a UTF-8 byte sequence decoded + // into a codepoint of the surrogate type as invalid. Thus, the surrogate + // type may not be specified. + // + enum class codepoint_types: std::uint16_t + { + // Useful to only allow the whitelisted codepoints or when building the + // type set incrementally. 
+ // + none = 0x00, + + graphic = 0x01, // L(etter), M(ark), N(number), P(unctuation), + // S(symbol), Zs(separator, space) + format = 0x02, + control = 0x04, + private_use = 0x08, + non_character = 0x10, + reserved = 0x20, + + any = 0x3f + }; + + LIBBUTL_SYMEXPORT bool + utf8 (const std::string&, + codepoint_types = codepoint_types::any, + const char32_t* whitelist = nullptr); + + codepoint_types operator& (codepoint_types, codepoint_types); + codepoint_types operator| (codepoint_types, codepoint_types); + codepoint_types operator&= (codepoint_types&, codepoint_types); + codepoint_types operator|= (codepoint_types&, codepoint_types); // If an input stream is in a failed state, then return true if this is // because of the eof and throw istream::failure otherwise. If the stream @@ -219,14 +255,12 @@ LIBBUTL_MODEXPORT namespace butl // // Note that on Windows setting an empty value usets the variable. // - LIBBUTL_SYMEXPORT - void + LIBBUTL_SYMEXPORT void setenv (const std::string& name, const std::string& value); // Throw system_error on failure. // - LIBBUTL_SYMEXPORT - void + LIBBUTL_SYMEXPORT void unsetenv (const std::string&); // Key comparators (i.e., to be used in sets, maps, etc). diff --git a/tests/utf8/buildfile b/tests/utf8/buildfile new file mode 100644 index 0000000..a2d10da --- /dev/null +++ b/tests/utf8/buildfile @@ -0,0 +1,7 @@ +# file : tests/utf8/buildfile +# copyright : Copyright (c) 2014-2019 Code Synthesis Ltd +# license : MIT; see accompanying LICENSE file + +import libs = libbutl%lib{butl} + +exe{driver}: {hxx cxx}{*} $libs diff --git a/tests/utf8/driver.cxx b/tests/utf8/driver.cxx new file mode 100644 index 0000000..06fb29f --- /dev/null +++ b/tests/utf8/driver.cxx @@ -0,0 +1,157 @@ +// file : tests/utf8/driver.cxx -*- C++ -*- +// copyright : Copyright (c) 2014-2019 Code Synthesis Ltd +// license : MIT; see accompanying LICENSE file + +#include + +#ifndef __cpp_lib_modules_ts +#include +#endif + +// Other includes. 
+ +#ifdef __cpp_modules_ts +#ifdef __cpp_lib_modules_ts +import std.core; +#endif +import butl.utility; +#else +#include +#endif + +using namespace std; +using namespace butl; + +int +main () +{ + // Valid sequences. + // + // Empty. + // + assert (utf8 ("")); + + // 1 code point. + // + assert (utf8 ("a")); // 1 byte. + assert (utf8 ("\xD0\xB0")); // 2 bytes. + assert (utf8 ("\xE4\xBA\x8C")); // 3 bytes. + assert (utf8 ("\xF0\x90\x8C\x82")); // 4 bytes. + + // Multiple code points. + // + assert (utf8 ("a\xD0\xB0\xE4\xBA\x8C\xF0\x90\x8C\x82")); + + // Ill-formed sequences. + // + // 2-byte sequences. + // + assert (!utf8 ("\xC1\x80")); // Invalid first byte. + assert (!utf8 ("\xD0y")); // Invalid second byte. + + // 3-byte sequences. + // + assert (!utf8 ("\xE2\x70\x80")); // Invalid second byte. + assert (!utf8 ("\xE2\x80\x70")); // Invalid third byte. + + assert (!utf8 ("\xED\xA0\x80")); // Min UTF-16 surrogate value. + assert (!utf8 ("\xED\xBF\xBF")); // Max UTF-16 surrogate value. + + // 4-byte sequences. + // + assert (!utf8 ("\xF5\x80\x80\x80")); // Invalid first byte. + assert (!utf8 ("\xF0\x80\x80\x80")); // Invalid second byte. + assert (!utf8 ("\xF0\x90\x70\x80")); // Invalid third byte. + assert (!utf8 ("\xF1\x80\x80\xC0")); // Invalid fourth byte. + + // Out of the codepoint range (0x10ffff + 1). + // + assert (!utf8 ("\xF4\x90\x80\x80")); + + // Incomplete sequences. + // + assert (!utf8 ("\xD0")); // 2-byte sequence. + assert (!utf8 ("\xE4\xBA")); // 3-byte sequence. + assert (!utf8 ("\xF0\x90\x8C")); // 4-byte sequence. + + // Missing sequence leading bytes. + // + assert (!utf8 ("\xB0xyz")); // 2-byte sequence. + assert (!utf8 ("\xBA\x8Cxyz")); // 3-byte sequence. + assert (!utf8 ("\x8Cxyz")); // 3-byte sequence. + assert (!utf8 ("\x90\x8C\x82xyz")); // 4-byte sequence. + assert (!utf8 ("\x8C\x82xyz")); // 4-byte sequence. + assert (!utf8 ("\x82xyz")); // 4-byte sequence. + + // Whitelisting. 
+ // + assert (utf8 ("\r\t\n")); + + // Matched codepoint types. + // + // Control. + // + assert (utf8 ("\r", codepoint_types::control)); + assert (utf8 ("\x7F", codepoint_types::control)); + + // Non-character. + // + assert (utf8 ("\xF4\x8F\xBF\xBF", codepoint_types::non_character)); + assert (utf8 ("\xEF\xB7\x90", codepoint_types::non_character)); + + // Private-use. + // + assert (utf8 ("\xEE\x80\x80", codepoint_types::private_use)); + assert (utf8 ("\xF3\xB0\x80\x80", codepoint_types::private_use)); + + // Reserved. + // + assert (utf8 ("\xF3\xA1\x80\x80", codepoint_types::reserved)); + assert (utf8 ("\xF0\xB0\x80\x80", codepoint_types::reserved)); + assert (utf8 ("\xF3\xA0\x82\x80", codepoint_types::reserved)); + + // Format. + // + assert (utf8 ("\xC2\xAD", codepoint_types::format)); + assert (utf8 ("\xD8\x80", codepoint_types::format)); + assert (utf8 ("\xD8\x81", codepoint_types::format)); + assert (utf8 ("\xD8\x85", codepoint_types::format)); + assert (utf8 ("\xF3\xA0\x81\xBF", codepoint_types::format)); + + // Graphic. + // + assert (utf8 ("\xC2\xAC", codepoint_types::graphic)); + assert (utf8 ("\xC2\xAE", codepoint_types::graphic)); + assert (utf8 ("\xD8\x86", codepoint_types::graphic)); + assert (utf8 ("\xF3\xA0\x84\x80", codepoint_types::graphic)); + + // Private-use & graphic. + // + assert (utf8 ("\xEE\x80\x80\xF3\xB0\x80\x80\xC2\xAC", + codepoint_types::private_use | codepoint_types::graphic)); + + // None. + // + assert (utf8 ("\t", codepoint_types::none, U"\t")); // Whitelisted. + + // Any. + // + assert (utf8 ("\t")); + + // Unmatched codepoint types. + // + assert (!utf8 ("\x7F", codepoint_types::graphic, U"\t")); // Control. + assert (!utf8 ("\xEF\xB7\x90", codepoint_types::graphic)); // Non-char. + assert (!utf8 ("\xEE\x80\x80", codepoint_types::graphic)); // Private. + assert (!utf8 ("\xF3\xA1\x80\x80", codepoint_types::graphic)); // Reserved. + assert (!utf8 ("\xF3\xA0\x81\xBF", codepoint_types::graphic)); // Format. 
+ + assert (!utf8 ("\xC2\xAC", codepoint_types::format)); // Graphic. + + // Private-use & Graphic. + // + assert (!utf8 ("\xEE\x80\x80\xF3\xB0\x80\x80\xC2\xAC", + codepoint_types::format)); + + assert (!utf8 ("a", codepoint_types::none)); // None. +} -- cgit v1.1