From 5ae9686adac1508873f2d980e84becd3496244c2 Mon Sep 17 00:00:00 2001 From: Karen Arutyunov Date: Wed, 26 Feb 2020 17:16:45 +0300 Subject: Add notion of validator to char_scanner and make sure manifest is UTF-8 This involves implementing utf8_validator and UTF-8 utility functions and using them during the manifest parsing, serialization, and rewriting. --- libbutl/unicode.ixx | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) create mode 100644 libbutl/unicode.ixx (limited to 'libbutl/unicode.ixx') diff --git a/libbutl/unicode.ixx b/libbutl/unicode.ixx new file mode 100644 index 0000000..cba4fd2 --- /dev/null +++ b/libbutl/unicode.ixx @@ -0,0 +1,72 @@ +// file : libbutl/unicode.ixx -*- C++ -*- +// license : MIT; see accompanying LICENSE file + +namespace butl +{ + inline codepoint_types + operator&= (codepoint_types& x, codepoint_types y) + { + return x = static_cast ( + static_cast (x) & + static_cast (y)); + } + + inline codepoint_types + operator|= (codepoint_types& x, codepoint_types y) + { + return x = static_cast ( + static_cast (x) | + static_cast (y)); + } + + inline codepoint_types + operator& (codepoint_types x, codepoint_types y) + { + return x &= y; + } + + inline codepoint_types + operator| (codepoint_types x, codepoint_types y) + { + return x |= y; + } + + LIBBUTL_SYMEXPORT codepoint_types + codepoint_type_lookup (char32_t); + + inline codepoint_types + codepoint_type (char32_t c) + { + // Optimize for the common case (printable ASCII characters). + // + if (c >= 0x20 && c <= 0x7E) // Printable ASCII? + return codepoint_types::graphic; + else if (c > 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF)) // Invalid? + return codepoint_types::none; + else if ((c & 0xFFFF) >= 0xFFFE) // Non-range based? + return codepoint_types::non_character; + else + return codepoint_type_lookup (c); + } + + inline std::string + to_string (codepoint_types t) + { + // Note that we use the terms from the Unicode standard ("private-use" + // rather than "private use", "noncharacter" rather than "non-character"). + // + switch (t) + { + case codepoint_types::graphic: return "graphic"; + case codepoint_types::format: return "format"; + case codepoint_types::control: return "control"; + case codepoint_types::private_use: return "private-use"; + case codepoint_types::non_character: return "noncharacter"; // No dash. + case codepoint_types::reserved: return "reserved"; + case codepoint_types::none: + case codepoint_types::any: return ""; + } + + return ""; // Types combination. + } +} -- cgit v1.1