From acd7a29c02e222e27d474fe3b64b1ca11b3506c0 Mon Sep 17 00:00:00 2001 From: Karen Arutyunov Date: Wed, 18 Dec 2019 22:32:16 +0300 Subject: Add utf8() predicate --- libbutl/utf8.cxx | 343 ++++++++++++++++++++++++++++++++++++++++++++++++++++ libbutl/utility.ixx | 28 +++++ libbutl/utility.mxx | 46 ++++++- 3 files changed, 411 insertions(+), 6 deletions(-) create mode 100644 libbutl/utf8.cxx (limited to 'libbutl') diff --git a/libbutl/utf8.cxx b/libbutl/utf8.cxx new file mode 100644 index 0000000..e2a42bd --- /dev/null +++ b/libbutl/utf8.cxx @@ -0,0 +1,343 @@ +// file : libbutl/utf8.cxx -*- C++ -*- +// copyright : Copyright (c) 2014-2019 Code Synthesis Ltd +// license : MIT; see accompanying LICENSE file + +#ifndef __cpp_modules_ts +#include +#endif + +#ifndef __cpp_lib_modules_ts +#include +#include + +#include // lower_bound() +#endif + +#ifdef __cpp_modules_ts +module butl.utility; + +// Only imports additional to interface. +#ifdef __clang__ +#ifdef __cpp_lib_modules_ts +import std.core; +import std.io; +#endif +#endif + +#endif + +namespace butl +{ + using namespace std; + + // Sorted arrays of the Unicode codepoint ranges corresponding to the + // codepoint types. Note that codepoint type range lists (but not ranges + // themselves) may overlap. + // + // Note that the graphic type codepoints are numerous and scattered. Thus, + // we will consider a codepoint to be of the graphic type if it is not of + // any other type. + // + using codepoint_range = pair; + + static const codepoint_range cn_rs[] = // Control. + { + {0x00, 0x1F}, + {0x7F, 0x9F} + }; + + static const codepoint_range fr_rs[] = // Format. 
+ { + {0x000AD, 0x000AD}, + {0x00600, 0x00605}, + {0x0061C, 0x0061C}, + {0x006DD, 0x006DD}, + {0x0070F, 0x0070F}, + {0x008E2, 0x008E2}, + {0x0180E, 0x0180E}, + {0x0200B, 0x0200F}, + {0x0202A, 0x0202E}, + {0x02060, 0x02064}, + {0x02066, 0x0206F}, + {0x0FEFF, 0x0FEFF}, + {0x0FFF9, 0x0FFFB}, + {0x110BD, 0x110BD}, + {0x110CD, 0x110CD}, + {0x13430, 0x13438}, + {0x1BCA0, 0x1BCA3}, + {0x1D173, 0x1D17A}, + {0xE0001, 0xE0001}, + {0xE0020, 0xE007F} + }; + + static const codepoint_range pr_rs[] = // Private-use. + { + {0x00E000, 0x00F8FF}, + {0x0F0000, 0x10FFFF} + }; + + static const codepoint_range nc_rs[] = // Non-character. + { + {0xFDD0, 0xFDEF} + }; + + static const codepoint_range rs_rs[] = // Reserved. + { + {0x30000, 0xE0000}, + {0xE0002, 0xE001F}, + {0xE0080, 0xE00FF}, + {0xE01F0, 0xEFFFF} + }; + + struct codepoint_type_ranges + { + codepoint_types type; + const codepoint_range* begin; + const codepoint_range* end; + }; + + static const codepoint_type_ranges ct_ranges[] = + { + { + codepoint_types::control, + cn_rs, + cn_rs + sizeof (cn_rs) / sizeof (*cn_rs) + }, + { + codepoint_types::format, + fr_rs, + fr_rs + sizeof (fr_rs) / sizeof (*fr_rs) + }, + { + codepoint_types::private_use, + pr_rs, + pr_rs + sizeof (pr_rs) / sizeof (*pr_rs) + }, + { + codepoint_types::non_character, + nc_rs, + nc_rs + sizeof (nc_rs) / sizeof (*nc_rs) + }, + { + codepoint_types::reserved, + rs_rs, + rs_rs + sizeof (rs_rs) / sizeof (*rs_rs) + } + }; + + bool + utf8 (const string& s, codepoint_types ts, const char32_t* wl) + { + // A UCS-4 character is encoded as the UTF-8 byte sequence as follows, + // depending on the value range it falls into: + // + // 0x00000000 - 0x0000007F: + // 0xxxxxxx + // + // 0x00000080 - 0x000007FF: + // 110xxxxx 10xxxxxx + // + // 0x00000800 - 0x0000FFFF: + // 1110xxxx 10xxxxxx 10xxxxxx + // + // 0x00010000 - 0x001FFFFF: + // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + // + // 0x00200000 - 0x03FFFFFF: + // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + // 
0x04000000 - 0x7FFFFFFF: + // 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx + // + // Also note that the Unicode Standard (as of 12.1) specifies no + // codepoints above 0x10FFFF, so we consider 5- and 6-byte length UTF-8 + // sequences as invalid (we could have added `unspecified` codepoint type + // except that there are no UTF-8 validation tables defined for these + // sequences). + // + size_t n (s.size ()); + + for (size_t i (0); i != n; ) + { + // Detect the UTF-8 byte sequence length based on its first byte. While + // at it, start calculating the Unicode codepoint value. + // + size_t sn; + char32_t c; + unsigned char b1 (s[i]); + + if (b1 < 0x80) + { + sn = 1; + c = b1; + } + else if (b1 < 0xE0) + { + sn = 2; + c = b1 & 0x1F; // Takes 5 rightmost bits of the first sequence byte. + } + else if (b1 < 0xF0) + { + sn = 3; + c = b1 & 0xF; // Takes 4 rightmost bits of the first sequence byte. + } + else if (b1 < 0xF8) + { + sn = 4; + c = b1 & 0x7; // Takes 3 rightmost bits of the first sequence byte. + } + else + return false; // The byte starts a 5- or 6-byte length sequence. + + // Bail out if the string doesn't contain all the required codepoint + // encoding bytes. + // + if (sn > n - i) + return false; + + // Note that while a codepoint may potentially be encoded with byte + // sequences of different lengths, only the shortest encoding sequence + // is considered well-formed. Also a well-formed sequence may not be + // decoded into a UTF-16 surrogate value ([D800 DFFF]) or a value that + // is greater than the max codepoint value (0x10FFFF). We will check all + // that using the Well-Formed UTF-8 Byte Sequences table (provided by + // the Unicode 12.0 Standard) which also takes care of the missing UTF-8 + // sequence bytes. + // + // Return true if a byte value belongs to the specified range. 
+ // + auto belongs = [] (unsigned char c, unsigned char l, unsigned char r) + { + return c >= l && c <= r; + }; + + switch (sn) + { + case 1: break; // Always well-formed by the definition (see above). + case 2: + { + // [000080 0007FF]: [C2 DF] [80 BF] + // + // Check the first/second bytes combinations: + // + if (!(belongs (b1, 0xC2, 0xDF) && belongs (s[i + 1], 0x80, 0xBF))) + return false; + + break; + } + case 3: + { + // [000800 000FFF]: E0 [A0 BF] [80 BF] + // [001000 00CFFF]: [E1 EC] [80 BF] [80 BF] + // [00D000 00D7FF]: ED [80 9F] [80 BF] ; Excludes surrogates. + // [00E000 00FFFF]: [EE EF] [80 BF] [80 BF] + // + unsigned char b2 (s[i + 1]); + + if (!((b1 == 0xE0 && belongs (b2, 0xA0, 0xBF)) || + (belongs (b1, 0xE1, 0xEC) && belongs (b2, 0x80, 0xBF)) || + (b1 == 0xED && belongs (b2, 0x80, 0x9F)) || + (belongs (b1, 0xEE, 0xEF) && belongs (b2, 0x80, 0xBF))) || + !belongs (s[i + 2], 0x80, 0xBF)) + return false; + + break; + } + case 4: + { + // [010000 03FFFF]: F0 [90 BF] [80 BF] [80 BF] + // [040000 0FFFFF]: [F1 F3] [80 BF] [80 BF] [80 BF] + // [100000 10FFFF]: F4 [80 8F] [80 BF] [80 BF] + // + unsigned char b2 (s[i + 1]); + + if (!((b1 == 0xF0 && belongs (b2, 0x90, 0xBF)) || + (belongs (b1, 0xF1, 0xF3) && belongs (b2, 0x80, 0xBF)) || + (b1 == 0xF4 && belongs (b2, 0x80, 0x8F))) || + !belongs (s[i + 2], 0x80, 0xBF) || + !belongs (s[i + 3], 0x80, 0xBF)) + return false; + + break; + } + } + + // For the remaining sequence bytes, "append" their 6 rightmost bits to + // the resulting codepoint value. + // + --sn; + ++i; + + for (size_t n (i + sn); i != n; ++i) + c = (c << 6) | (s[i] & 0x3F); + + // Check the decoded codepoint, unless any codepoint type is allowed. + // + if (ts == codepoint_types::any) + continue; + + using traits = u32string::traits_type; + + // Check if the decoded codepoint is whitelisted. 
+ // + if (wl != nullptr && + traits::find (wl, traits::length (wl), c) != nullptr) + continue; + + // Match the decoded codepoint type against the specified type set. + // + // Detect the codepoint type (see the Types of Code Points table in the + // Unicode 12.0 Standard for details). + // + codepoint_types ct; + + // Optimize for the common case (printable ASCII characters). + // + if (c >= 0x20 && c <= 0x7E) + ct = codepoint_types::graphic; + else if ((c & 0xFFFF) >= 0xFFFE) // Non-range based detection. + ct = codepoint_types::non_character; + else + { + // Note that we consider a codepoint to be of the graphic type if it + // is not of any other type (see above). + // + ct = codepoint_types::graphic; + + // Note that the codepoint type range lists may overlap. Thus, we + // iterate over all of them until there is a match. + // + for (size_t i (0); i != sizeof (ct_ranges) / sizeof (*ct_ranges); ++i) + { + const codepoint_type_ranges& rs (ct_ranges[i]); + + // Find the range that either contains the codepoint or lies to the + // right of it. Note that here we assume a range to be less than a + // codepoint if it lies to the left of the codepoint. + // + const codepoint_range* r ( + lower_bound (rs.begin, rs.end, + c, + [] (const codepoint_range& r, char32_t c) + { + return r.second < c; + })); + + if (r != rs.end && r->first <= c) // Contains the codepoint? + { + ct = rs.type; + break; + } + } + } + + // Now check if the codepoint type matches the specified set. Note: also + // covers the `ts == codepoint_types::none` case. 
+ // + if ((ct & ts) == codepoint_types::none) + return false; + } + + return true; + } +} diff --git a/libbutl/utility.ixx b/libbutl/utility.ixx index d8a5ee8..aabaef6 100644 --- a/libbutl/utility.ixx +++ b/libbutl/utility.ixx @@ -217,6 +217,34 @@ namespace butl return sanitize_identifier (std::string (s)); } + inline codepoint_types + operator&= (codepoint_types& x, codepoint_types y) + { + return x = static_cast ( + static_cast (x) & + static_cast (y)); + } + + inline codepoint_types + operator|= (codepoint_types& x, codepoint_types y) + { + return x = static_cast ( + static_cast (x) | + static_cast (y)); + } + + inline codepoint_types + operator& (codepoint_types x, codepoint_types y) + { + return x &= y; + } + + inline codepoint_types + operator| (codepoint_types x, codepoint_types y) + { + return x |= y; + } + inline bool eof (std::istream& is) { diff --git a/libbutl/utility.mxx b/libbutl/utility.mxx index 11aa013..3bb335a 100644 --- a/libbutl/utility.mxx +++ b/libbutl/utility.mxx @@ -192,8 +192,44 @@ LIBBUTL_MODEXPORT namespace butl // Note that it doesn't make sure the first character is not a digit. // std::string& sanitize_identifier (std::string&); - std::string sanitize_identifier (std::string&&); - std::string sanitize_identifier (const std::string&); + std::string sanitize_identifier (std::string&&); + std::string sanitize_identifier (const std::string&); + + // Return true if the string is a valid UTF-8 encoded byte sequence and, + // optionally, its decoded codepoints belong to the specified types or to + // the codepoint whitelist, unless it is NULL (which is the default). + // + // Note that the Unicode Standard considers a UTF-8 byte sequence decoded + // into a codepoint of the surrogate type as invalid. Thus, the surrogate + // type may not be specified. + // + enum class codepoint_types: std::uint16_t + { + // Useful to only allow the whitelisted codepoints or when building the + // type set incrementally. 
+ // + none = 0x00, + + graphic = 0x01, // L(etter), M(ark), N(umber), P(unctuation), + // S(ymbol), Zs(separator, space) + format = 0x02, + control = 0x04, + private_use = 0x08, + non_character = 0x10, + reserved = 0x20, + + any = 0x3f + }; + + LIBBUTL_SYMEXPORT bool + utf8 (const std::string&, + codepoint_types = codepoint_types::any, + const char32_t* whitelist = nullptr); + + codepoint_types operator& (codepoint_types, codepoint_types); + codepoint_types operator| (codepoint_types, codepoint_types); + codepoint_types operator&= (codepoint_types&, codepoint_types); + codepoint_types operator|= (codepoint_types&, codepoint_types); // If an input stream is in a failed state, then return true if this is // because of the eof and throw istream::failure otherwise. If the stream @@ -219,14 +255,12 @@ LIBBUTL_MODEXPORT namespace butl // // Note that on Windows setting an empty value usets the variable. // - LIBBUTL_SYMEXPORT - void + LIBBUTL_SYMEXPORT void setenv (const std::string& name, const std::string& value); // Throw system_error on failure. // - LIBBUTL_SYMEXPORT - void + LIBBUTL_SYMEXPORT void unsetenv (const std::string&); // Key comparators (i.e., to be used in sets, maps, etc). -- cgit v1.1