aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKaren Arutyunov <karen@codesynthesis.com>2019-12-18 22:32:16 +0300
committerKaren Arutyunov <karen@codesynthesis.com>2020-01-28 15:18:33 +0300
commitacd7a29c02e222e27d474fe3b64b1ca11b3506c0 (patch)
treecf26139e835e7caad479cfb52e4e6c0fc2c887e2
parent5cd9e0c25e39ff1449e38a9c74e131e7359e7183 (diff)
Add utf8() predicate
-rw-r--r--libbutl/utf8.cxx343
-rw-r--r--libbutl/utility.ixx28
-rw-r--r--libbutl/utility.mxx46
-rw-r--r--tests/utf8/buildfile7
-rw-r--r--tests/utf8/driver.cxx157
5 files changed, 575 insertions, 6 deletions
diff --git a/libbutl/utf8.cxx b/libbutl/utf8.cxx
new file mode 100644
index 0000000..e2a42bd
--- /dev/null
+++ b/libbutl/utf8.cxx
@@ -0,0 +1,343 @@
+// file : libbutl/utf8.cxx -*- C++ -*-
+// copyright : Copyright (c) 2014-2019 Code Synthesis Ltd
+// license : MIT; see accompanying LICENSE file
+
+#ifndef __cpp_modules_ts
+#include <libbutl/utility.mxx>
+#endif
+
+#ifndef __cpp_lib_modules_ts
+#include <string>
+#include <cstddef>
+
+#include <algorithm> // lower_bound()
+#endif
+
+#ifdef __cpp_modules_ts
+module butl.utility;
+
+// Only imports additional to interface.
+#ifdef __clang__
+#ifdef __cpp_lib_modules_ts
+import std.core;
+import std.io;
+#endif
+#endif
+
+#endif
+
+namespace butl
+{
+ using namespace std;
+
+ // Sorted arrays of the Unicode codepoint ranges corresponding to the
+ // codepoint types. Note that the codepoint type range lists (but not the
+ // ranges within a list) may overlap.
+ //
+ // Note that the graphic type codepoints are numerous and scattered. Thus,
+ // we will consider a codepoint to be of the graphic type if it is not of
+ // any other type.
+ //
+ using codepoint_range = pair<char32_t, char32_t>; // Inclusive [first, second].
+
+ static const codepoint_range cn_rs[] = // Control.
+ {
+ {0x00, 0x1F},
+ {0x7F, 0x9F}
+ };
+
+ static const codepoint_range fr_rs[] = // Format.
+ {
+ {0x000AD, 0x000AD},
+ {0x00600, 0x00605},
+ {0x0061C, 0x0061C},
+ {0x006DD, 0x006DD},
+ {0x0070F, 0x0070F},
+ {0x008E2, 0x008E2},
+ {0x0180E, 0x0180E},
+ {0x0200B, 0x0200F},
+ {0x0202A, 0x0202E},
+ {0x02060, 0x02064},
+ {0x02066, 0x0206F},
+ {0x0FEFF, 0x0FEFF},
+ {0x0FFF9, 0x0FFFB},
+ {0x110BD, 0x110BD},
+ {0x110CD, 0x110CD},
+ {0x13430, 0x13438},
+ {0x1BCA0, 0x1BCA3},
+ {0x1D173, 0x1D17A},
+ {0xE0001, 0xE0001},
+ {0xE0020, 0xE007F}
+ };
+
+ static const codepoint_range pr_rs[] = // Private-use.
+ {
+ {0x00E000, 0x00F8FF},
+ {0x0F0000, 0x10FFFF}
+ };
+
+ static const codepoint_range nc_rs[] = // Non-character (the xxFFFE/xxFFFF codepoints are detected separately).
+ {
+ {0xFDD0, 0xFDEF}
+ };
+
+ static const codepoint_range rs_rs[] = // Reserved.
+ {
+ {0x30000, 0xE0000},
+ {0xE0002, 0xE001F},
+ {0xE0080, 0xE00FF},
+ {0xE01F0, 0xEFFFF}
+ };
+
+ // A codepoint type paired with the [begin, end) bounds of the sorted range
+ // table that lists the codepoints of this type.
+ //
+ struct codepoint_type_ranges
+ {
+   codepoint_types type;
+   const codepoint_range* begin;
+   const codepoint_range* end;
+ };
+
+ // The range tables in the order they are searched (the first table that
+ // contains the codepoint determines its type).
+ //
+ static const codepoint_type_ranges ct_ranges[] =
+ {
+   {codepoint_types::control,       cn_rs, cn_rs + sizeof (cn_rs) / sizeof (cn_rs[0])},
+   {codepoint_types::format,        fr_rs, fr_rs + sizeof (fr_rs) / sizeof (fr_rs[0])},
+   {codepoint_types::private_use,   pr_rs, pr_rs + sizeof (pr_rs) / sizeof (pr_rs[0])},
+   {codepoint_types::non_character, nc_rs, nc_rs + sizeof (nc_rs) / sizeof (nc_rs[0])},
+   {codepoint_types::reserved,      rs_rs, rs_rs + sizeof (rs_rs) / sizeof (rs_rs[0])}
+ };
+
+ bool
+ utf8 (const string& s, codepoint_types ts, const char32_t* wl)
+ {
+   // Validate the string one codepoint at a time: deduce the sequence length
+   // from the leading byte, verify that the sequence is well-formed, decode
+   // it and, unless any codepoint type is acceptable, classify the result.
+   //
+   // A UCS-4 character is encoded as the following UTF-8 byte sequence,
+   // depending on the value range it falls into:
+   //
+   // 0x00000000 - 0x0000007F: 0xxxxxxx
+   // 0x00000080 - 0x000007FF: 110xxxxx 10xxxxxx
+   // 0x00000800 - 0x0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
+   // 0x00010000 - 0x001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+   // 0x00200000 - 0x03FFFFFF: 111110xx followed by four 10xxxxxx bytes
+   // 0x04000000 - 0x7FFFFFFF: 1111110x followed by five 10xxxxxx bytes
+   //
+   // The Unicode Standard (as of 12.1) specifies no codepoints above
+   // 0x10FFFF, so the 5- and 6-byte sequences are considered invalid (there
+   // are also no UTF-8 validation tables defined for such sequences).
+   //
+
+   // Return true if a byte value lies within the inclusive range.
+   //
+   auto within = [] (unsigned char b, unsigned char lo, unsigned char hi)
+   {
+     return lo <= b && b <= hi;
+   };
+
+   const size_t size (s.size ());
+   size_t pos (0);
+
+   while (pos != size)
+   {
+     // Deduce the sequence length from the leading byte and seed the
+     // codepoint value with the payload bits this byte carries.
+     //
+     const unsigned char lead (s[pos]);
+
+     size_t len;
+     char32_t cp;
+
+     if (lead >= 0xF8)
+       return false; // Starts a 5- or 6-byte sequence.
+     else if (lead >= 0xF0)
+     {
+       len = 4;
+       cp = lead & 0x07; // 3 rightmost bits.
+     }
+     else if (lead >= 0xE0)
+     {
+       len = 3;
+       cp = lead & 0x0F; // 4 rightmost bits.
+     }
+     else if (lead >= 0x80)
+     {
+       len = 2;
+       cp = lead & 0x1F; // 5 rightmost bits.
+     }
+     else
+     {
+       len = 1;
+       cp = lead;
+     }
+
+     // Bail out if the string doesn't contain all the required codepoint
+     // encoding bytes.
+     //
+     if (size - pos < len)
+       return false;
+
+     // Only the shortest encoding of a codepoint is well-formed. Also, a
+     // well-formed sequence may not decode into a UTF-16 surrogate ([D800
+     // DFFF]) or into a value greater than 0x10FFFF. All of that (as well as
+     // the missing/misplaced sequence bytes) is verified according to the
+     // Well-Formed UTF-8 Byte Sequences table of the Unicode 12.0 Standard:
+     //
+     // [000080 0007FF]: [C2 DF] [80 BF]
+     //
+     // [000800 000FFF]: E0      [A0 BF] [80 BF]
+     // [001000 00CFFF]: [E1 EC] [80 BF] [80 BF]
+     // [00D000 00D7FF]: ED      [80 9F] [80 BF]         ; Excludes surrogates.
+     // [00E000 00FFFF]: [EE EF] [80 BF] [80 BF]
+     //
+     // [010000 03FFFF]: F0      [90 BF] [80 BF] [80 BF]
+     // [040000 0FFFFF]: [F1 F3] [80 BF] [80 BF] [80 BF]
+     // [100000 10FFFF]: F4      [80 8F] [80 BF] [80 BF]
+     //
+     // Single-byte sequences are always well-formed by definition.
+     //
+     bool well_formed (true);
+
+     if (len == 2)
+     {
+       well_formed = within (lead, 0xC2, 0xDF) &&
+                     within (s[pos + 1], 0x80, 0xBF);
+     }
+     else if (len == 3)
+     {
+       // Here the leading byte is in [E0 EF], so only the second byte range
+       // depends on it.
+       //
+       const unsigned char b2 (s[pos + 1]);
+
+       const bool b2_ok (lead == 0xE0 ? within (b2, 0xA0, 0xBF) :
+                         lead == 0xED ? within (b2, 0x80, 0x9F) :
+                                        within (b2, 0x80, 0xBF));
+
+       well_formed = b2_ok && within (s[pos + 2], 0x80, 0xBF);
+     }
+     else if (len == 4)
+     {
+       // Here the leading byte is in [F0 F7], of which [F5 F7] are invalid.
+       //
+       const unsigned char b2 (s[pos + 1]);
+
+       const bool b2_ok (lead == 0xF0 ? within (b2, 0x90, 0xBF) :
+                         lead <  0xF4 ? within (b2, 0x80, 0xBF) :
+                         lead == 0xF4 ? within (b2, 0x80, 0x8F) :
+                                        false);
+
+       well_formed = b2_ok                          &&
+                     within (s[pos + 2], 0x80, 0xBF) &&
+                     within (s[pos + 3], 0x80, 0xBF);
+     }
+
+     if (!well_formed)
+       return false;
+
+     // "Append" the 6 rightmost bits of every continuation byte to the
+     // codepoint value and step over the whole sequence.
+     //
+     for (size_t k (1); k != len; ++k)
+       cp = (cp << 6) | (s[pos + k] & 0x3F);
+
+     pos += len;
+
+     // Nothing else to check if any codepoint type is allowed.
+     //
+     if (ts == codepoint_types::any)
+       continue;
+
+     // The whitelisted codepoints are accepted regardless of their types.
+     //
+     using traits = u32string::traits_type;
+
+     if (wl != nullptr &&
+         traits::find (wl, traits::length (wl), cp) != nullptr)
+       continue;
+
+     // Detect the codepoint type (see the Types of Code Points table in the
+     // Unicode 12.0 Standard for details). A codepoint is considered graphic
+     // unless it turns out to be of some other type.
+     //
+     codepoint_types ct (codepoint_types::graphic);
+
+     // The printable ASCII characters (the common case) are graphic and
+     // need no lookup.
+     //
+     if (cp < 0x20 || cp > 0x7E)
+     {
+       if ((cp & 0xFFFF) >= 0xFFFE) // Non-range based detection.
+         ct = codepoint_types::non_character;
+       else
+       {
+         // A range is "less" than a codepoint if it lays entirely to the
+         // left of it.
+         //
+         auto left_of = [] (const codepoint_range& r, char32_t v)
+         {
+           return r.second < v;
+         };
+
+         // The range lists of different types may overlap, so try them in
+         // order and stop at the first one that contains the codepoint.
+         //
+         for (const codepoint_type_ranges& rs: ct_ranges)
+         {
+           // Find the range that either contains the codepoint or lays to
+           // the right of it.
+           //
+           const codepoint_range* r (
+             lower_bound (rs.begin, rs.end, cp, left_of));
+
+           if (r != rs.end && r->first <= cp) // Contains the codepoint?
+           {
+             ct = rs.type;
+             break;
+           }
+         }
+       }
+     }
+
+     // Fail if the detected type is not in the specified set. Note: this
+     // also covers the `ts == codepoint_types::none` case.
+     //
+     if ((ct & ts) == codepoint_types::none)
+       return false;
+   }
+
+   return true;
+ }
+}
diff --git a/libbutl/utility.ixx b/libbutl/utility.ixx
index d8a5ee8..aabaef6 100644
--- a/libbutl/utility.ixx
+++ b/libbutl/utility.ixx
@@ -217,6 +217,34 @@ namespace butl
return sanitize_identifier (std::string (s));
}
+ // Intersect the type set x with y in place and return the resulting set.
+ //
+ inline codepoint_types
+ operator&= (codepoint_types& x, codepoint_types y)
+ {
+   const std::uint16_t bits (static_cast<std::uint16_t> (x) &
+                             static_cast<std::uint16_t> (y));
+
+   x = static_cast<codepoint_types> (bits);
+   return x;
+ }
+
+ // Merge the type set y into x in place and return the resulting set.
+ //
+ inline codepoint_types
+ operator|= (codepoint_types& x, codepoint_types y)
+ {
+   const std::uint16_t bits (static_cast<std::uint16_t> (x) |
+                             static_cast<std::uint16_t> (y));
+
+   x = static_cast<codepoint_types> (bits);
+   return x;
+ }
+
+ // Return the intersection of two type sets (x is a by-value copy, so the
+ // caller's argument is not modified).
+ //
+ inline codepoint_types
+ operator& (codepoint_types x, codepoint_types y)
+ {
+   x &= y;
+   return x;
+ }
+
+ // Return the union of two type sets (x is a by-value copy, so the caller's
+ // argument is not modified).
+ //
+ inline codepoint_types
+ operator| (codepoint_types x, codepoint_types y)
+ {
+   x |= y;
+   return x;
+ }
+
inline bool
eof (std::istream& is)
{
diff --git a/libbutl/utility.mxx b/libbutl/utility.mxx
index 11aa013..3bb335a 100644
--- a/libbutl/utility.mxx
+++ b/libbutl/utility.mxx
@@ -192,8 +192,44 @@ LIBBUTL_MODEXPORT namespace butl
// Note that it doesn't make sure the first character is not a digit.
//
std::string& sanitize_identifier (std::string&);
- std::string sanitize_identifier (std::string&&);
- std::string sanitize_identifier (const std::string&);
+ std::string sanitize_identifier (std::string&&);
+ std::string sanitize_identifier (const std::string&);
+
+ // Return true if the string is a valid UTF-8 encoded byte sequence and,
+ // optionally, its decoded codepoints belong to the specified types or to
+ // the codepoint whitelist, unless it is NULL (the default).
+ //
+ // Note that the Unicode Standard considers a UTF-8 byte sequence decoded
+ // into a codepoint of the surrogate type as invalid. Thus, the surrogate
+ // type may not be specified.
+ //
+ enum class codepoint_types: std::uint16_t
+ {
+ // Useful to only allow the whitelisted codepoints or when building the
+ // type set incrementally.
+ //
+ none = 0x00,
+
+ graphic = 0x01, // L(etter), M(ark), N(umber), P(unctuation),
+ // S(ymbol), Zs(separator, space)
+ format = 0x02,
+ control = 0x04,
+ private_use = 0x08,
+ non_character = 0x10,
+ reserved = 0x20,
+
+ any = 0x3f // All of the above type bits combined.
+ };
+
+ LIBBUTL_SYMEXPORT bool
+ utf8 (const std::string&,
+ codepoint_types = codepoint_types::any,
+ const char32_t* whitelist = nullptr);
+
+ codepoint_types operator& (codepoint_types, codepoint_types);
+ codepoint_types operator| (codepoint_types, codepoint_types);
+ codepoint_types operator&= (codepoint_types&, codepoint_types);
+ codepoint_types operator|= (codepoint_types&, codepoint_types);
// If an input stream is in a failed state, then return true if this is
// because of the eof and throw istream::failure otherwise. If the stream
@@ -219,14 +255,12 @@ LIBBUTL_MODEXPORT namespace butl
//
// Note that on Windows setting an empty value unsets the variable.
//
- LIBBUTL_SYMEXPORT
- void
+ LIBBUTL_SYMEXPORT void
setenv (const std::string& name, const std::string& value);
// Throw system_error on failure.
//
- LIBBUTL_SYMEXPORT
- void
+ LIBBUTL_SYMEXPORT void
unsetenv (const std::string&);
// Key comparators (i.e., to be used in sets, maps, etc).
diff --git a/tests/utf8/buildfile b/tests/utf8/buildfile
new file mode 100644
index 0000000..a2d10da
--- /dev/null
+++ b/tests/utf8/buildfile
@@ -0,0 +1,7 @@
+# file : tests/utf8/buildfile
+# copyright : Copyright (c) 2014-2019 Code Synthesis Ltd
+# license : MIT; see accompanying LICENSE file
+
+import libs = libbutl%lib{butl}
+
+exe{driver}: {hxx cxx}{*} $libs
diff --git a/tests/utf8/driver.cxx b/tests/utf8/driver.cxx
new file mode 100644
index 0000000..06fb29f
--- /dev/null
+++ b/tests/utf8/driver.cxx
@@ -0,0 +1,157 @@
+// file : tests/utf8/driver.cxx -*- C++ -*-
+// copyright : Copyright (c) 2014-2019 Code Synthesis Ltd
+// license : MIT; see accompanying LICENSE file
+
+#include <cassert>
+
+#ifndef __cpp_lib_modules_ts
+#include <string>
+#endif
+
+// Other includes.
+
+#ifdef __cpp_modules_ts
+#ifdef __cpp_lib_modules_ts
+import std.core;
+#endif
+import butl.utility;
+#else
+#include <libbutl/utility.mxx>
+#endif
+
+using namespace std;
+using namespace butl;
+
+int
+main ()
+{
+ // Valid sequences.
+ //
+ // Empty.
+ //
+ assert (utf8 (""));
+
+ // 1 code point.
+ //
+ assert (utf8 ("a")); // 1 byte.
+ assert (utf8 ("\xD0\xB0")); // 2 bytes.
+ assert (utf8 ("\xE4\xBA\x8C")); // 3 bytes.
+ assert (utf8 ("\xF0\x90\x8C\x82")); // 4 bytes.
+
+ // Multiple code points.
+ //
+ assert (utf8 ("a\xD0\xB0\xE4\xBA\x8C\xF0\x90\x8C\x82"));
+
+ // Ill-formed sequences.
+ //
+ // 2-byte sequences.
+ //
+ assert (!utf8 ("\xC1\x80")); // Invalid first byte.
+ assert (!utf8 ("\xD0y")); // Invalid second byte.
+
+ // 3-byte sequences.
+ //
+ assert (!utf8 ("\xE2\x70\x80")); // Invalid second byte.
+ assert (!utf8 ("\xE2\x80\x70")); // Invalid third byte.
+
+ assert (!utf8 ("\xED\xA0\x80")); // Min UTF-16 surrogate value.
+ assert (!utf8 ("\xED\xBF\xBF")); // Max UTF-16 surrogate value.
+
+ // 4-byte sequences.
+ //
+ assert (!utf8 ("\xF5\x80\x80\x80")); // Invalid first byte.
+ assert (!utf8 ("\xF0\x80\x80\x80")); // Invalid second byte.
+ assert (!utf8 ("\xF0\x90\x70\x80")); // Invalid third byte.
+ assert (!utf8 ("\xF1\x80\x80\xC0")); // Invalid fourth byte.
+
+ // Out of the codepoint range (0x10ffff + 1).
+ //
+ assert (!utf8 ("\xF4\x90\x80\x80"));
+
+ // Incomplete sequences.
+ //
+ assert (!utf8 ("\xD0")); // 2-byte sequence.
+ assert (!utf8 ("\xE4\xBA")); // 3-byte sequence.
+ assert (!utf8 ("\xF0\x90\x8C")); // 4-byte sequence.
+
+ // Missing sequence leading bytes.
+ //
+ assert (!utf8 ("\xB0xyz")); // 2-byte sequence.
+ assert (!utf8 ("\xBA\x8Cxyz")); // 3-byte sequence.
+ assert (!utf8 ("\x8Cxyz")); // 3-byte sequence.
+ assert (!utf8 ("\x90\x8C\x82xyz")); // 4-byte sequence.
+ assert (!utf8 ("\x8C\x82xyz")); // 4-byte sequence.
+ assert (!utf8 ("\x82xyz")); // 4-byte sequence.
+
+ // Whitelisting (passes regardless: the default type set is `any`).
+ //
+ assert (utf8 ("\r\t\n"));
+
+ // Matched codepoint types.
+ //
+ // Control.
+ //
+ assert (utf8 ("\r", codepoint_types::control));
+ assert (utf8 ("\x7F", codepoint_types::control));
+
+ // Non-character.
+ //
+ assert (utf8 ("\xF4\x8F\xBF\xBF", codepoint_types::non_character));
+ assert (utf8 ("\xEF\xB7\x90", codepoint_types::non_character));
+
+ // Private-use.
+ //
+ assert (utf8 ("\xEE\x80\x80", codepoint_types::private_use));
+ assert (utf8 ("\xF3\xB0\x80\x80", codepoint_types::private_use));
+
+ // Reserved.
+ //
+ assert (utf8 ("\xF3\xA1\x80\x80", codepoint_types::reserved));
+ assert (utf8 ("\xF0\xB0\x80\x80", codepoint_types::reserved));
+ assert (utf8 ("\xF3\xA0\x82\x80", codepoint_types::reserved));
+
+ // Format.
+ //
+ assert (utf8 ("\xC2\xAD", codepoint_types::format));
+ assert (utf8 ("\xD8\x80", codepoint_types::format));
+ assert (utf8 ("\xD8\x81", codepoint_types::format));
+ assert (utf8 ("\xD8\x85", codepoint_types::format));
+ assert (utf8 ("\xF3\xA0\x81\xBF", codepoint_types::format));
+
+ // Graphic.
+ //
+ assert (utf8 ("\xC2\xAC", codepoint_types::graphic));
+ assert (utf8 ("\xC2\xAE", codepoint_types::graphic));
+ assert (utf8 ("\xD8\x86", codepoint_types::graphic));
+ assert (utf8 ("\xF3\xA0\x84\x80", codepoint_types::graphic));
+
+ // Private-use & graphic.
+ //
+ assert (utf8 ("\xEE\x80\x80\xF3\xB0\x80\x80\xC2\xAC",
+ codepoint_types::private_use | codepoint_types::graphic));
+
+ // None.
+ //
+ assert (utf8 ("\t", codepoint_types::none, U"\t")); // Whitelisted.
+
+ // Any.
+ //
+ assert (utf8 ("\t"));
+
+ // Unmatched codepoint types.
+ //
+ assert (!utf8 ("\x7F", codepoint_types::graphic, U"\t")); // Control.
+ assert (!utf8 ("\xEF\xB7\x90", codepoint_types::graphic)); // Non-char.
+ assert (!utf8 ("\xEE\x80\x80", codepoint_types::graphic)); // Private.
+ assert (!utf8 ("\xF3\xA1\x80\x80", codepoint_types::graphic)); // Reserved.
+ assert (!utf8 ("\xF3\xA0\x81\xBF", codepoint_types::graphic)); // Format.
+
+ assert (!utf8 ("\xC2\xAC", codepoint_types::format)); // Graphic.
+
+ // Private-use & Graphic.
+ //
+ assert (!utf8 ("\xEE\x80\x80\xF3\xB0\x80\x80\xC2\xAC",
+ codepoint_types::format));
+
+ assert (!utf8 ("a", codepoint_types::none)); // None.
+}