aboutsummaryrefslogtreecommitdiff
path: root/libbutl/unicode.ixx
diff options
context:
space:
mode:
author Karen Arutyunov <karen@codesynthesis.com> 2020-02-26 17:16:45 +0300
committer Karen Arutyunov <karen@codesynthesis.com> 2020-02-26 17:17:49 +0300
commit 5ae9686adac1508873f2d980e84becd3496244c2 (patch)
tree d7c88e678b29ed6bb7ae30b74bd01aa2b5d2e9a8 /libbutl/unicode.ixx
parent afb726d2d59b3715960a8647738860f40e37cf4f (diff)
Add notion of validator to char_scanner and make sure manifest is UTF-8
This involves implementing utf8_validator and UTF-8 utility functions and using them during the manifest parsing, serialization, and rewriting.
Diffstat (limited to 'libbutl/unicode.ixx')
-rw-r--r-- libbutl/unicode.ixx 72
1 files changed, 72 insertions, 0 deletions
diff --git a/libbutl/unicode.ixx b/libbutl/unicode.ixx
new file mode 100644
index 0000000..cba4fd2
--- /dev/null
+++ b/libbutl/unicode.ixx
@@ -0,0 +1,72 @@
+// file : libbutl/unicode.ixx -*- C++ -*-
+// license : MIT; see accompanying LICENSE file
+
+namespace butl
+{
+ // Bitwise-AND y into x and return the updated value of x. Note that the
+ // result is returned by value rather than by reference.
+ //
+ inline codepoint_types
+ operator&= (codepoint_types& x, codepoint_types y)
+ {
+   using bits = std::uint16_t;
+
+   const auto common (static_cast<bits> (x) & static_cast<bits> (y));
+
+   x = static_cast<codepoint_types> (common);
+   return x;
+ }
+
+ // Bitwise-OR y into x and return the updated value of x. Note that the
+ // result is returned by value rather than by reference.
+ //
+ inline codepoint_types
+ operator|= (codepoint_types& x, codepoint_types y)
+ {
+   using bits = std::uint16_t;
+
+   const auto merged (static_cast<bits> (x) | static_cast<bits> (y));
+
+   x = static_cast<codepoint_types> (merged);
+   return x;
+ }
+
+ // Return the bitwise AND of x and y without modifying the caller's values.
+ //
+ inline codepoint_types
+ operator& (codepoint_types x, codepoint_types y)
+ {
+   // Since x is our own by-value copy, the compound assignment only touches
+   // the local.
+   //
+   x &= y;
+   return x;
+ }
+
+ // Return the bitwise OR of x and y without modifying the caller's values.
+ //
+ inline codepoint_types
+ operator| (codepoint_types x, codepoint_types y)
+ {
+   // Since x is our own by-value copy, the compound assignment only touches
+   // the local.
+   //
+   x |= y;
+   return x;
+ }
+
+ // Fallback used by codepoint_type() below for the code points that are not
+ // classified inline. Only declared here; the definition is out of line.
+ //
+ LIBBUTL_SYMEXPORT codepoint_types
+ codepoint_type_lookup (char32_t);
+
+ // Return the type of the specified code point. The cheap cases are decided
+ // inline and everything else is delegated to codepoint_type_lookup().
+ //
+ inline codepoint_types
+ codepoint_type (char32_t c)
+ {
+   // Fast path for the common case: printable ASCII characters.
+   //
+   if (0x20 <= c && c <= 0x7E)
+     return codepoint_types::graphic;
+
+   // Invalid: beyond the last code point or in the surrogate range.
+   //
+   const bool beyond_last (c > 0x10FFFF);
+   const bool surrogate (0xD800 <= c && c <= 0xDFFF);
+
+   if (beyond_last || surrogate)
+     return codepoint_types::none;
+
+   // Non-range based non-characters: the low 16 bits are 0xFFFE or 0xFFFF,
+   // that is, the last two code points of each plane.
+   //
+   if ((c & 0xFFFF) >= 0xFFFE)
+     return codepoint_types::non_character;
+
+   return codepoint_type_lookup (c);
+ }
+
+ // Return the name of a single code point type. For none, any, and any
+ // combination of types the result is an empty string.
+ //
+ inline std::string
+ to_string (codepoint_types t)
+ {
+   // The names follow the Unicode standard terminology: "private-use" (not
+   // "private use") and "noncharacter" (not "non-character").
+   //
+   const char* r (""); // Stays empty for none, any, and types combination.
+
+   switch (t)
+   {
+   case codepoint_types::graphic:       r = "graphic";      break;
+   case codepoint_types::format:        r = "format";       break;
+   case codepoint_types::control:       r = "control";      break;
+   case codepoint_types::private_use:   r = "private-use";  break;
+   case codepoint_types::non_character: r = "noncharacter"; break; // No dash.
+   case codepoint_types::reserved:      r = "reserved";     break;
+   case codepoint_types::none:
+   case codepoint_types::any:                               break;
+   }
+
+   return r;
+ }
+}