From 5ae9686adac1508873f2d980e84becd3496244c2 Mon Sep 17 00:00:00 2001
From: Karen Arutyunov
Date: Wed, 26 Feb 2020 17:16:45 +0300
Subject: Add notion of validator to char_scanner and make sure manifest is UTF-8

This involves implementing utf8_validator and UTF-8 utility functions and
using them during the manifest parsing, serialization, and rewriting.
---
 libbutl/utility.mxx | 70 ++++++++++++++++++++++++++++-------------------------
 1 file changed, 37 insertions(+), 33 deletions(-)

(limited to 'libbutl/utility.mxx')

diff --git a/libbutl/utility.mxx b/libbutl/utility.mxx
index 71c2860..b84e731 100644
--- a/libbutl/utility.mxx
+++ b/libbutl/utility.mxx
@@ -16,13 +16,10 @@
 #include <iosfwd>      // ostream
 #include <istream>
 #include <cstddef>     // size_t
-#include <utility>     // move(), forward()
+#include <utility>     // move(), forward(), pair
 #include <cstring>     // strcmp(), strlen()
 #include <exception>   // exception, uncaught_exception[s]()
 //#include <functional> // hash
-
-#include <cctype>      // toupper(), tolower(), is*()
-#include <cwctype>     // isw*()
 #endif
 
 #include <libbutl/ft/lang.hxx> // thread_local
@@ -34,8 +31,12 @@ export module butl.utility;
 import std.core;
 import std.io;
 #endif
+import butl.utf8;
+import butl.unicode;
 import butl.optional;
 #else
+#include <libbutl/utf8.mxx>
+#include <libbutl/unicode.mxx>
 #include <libbutl/optional.mxx>
 #endif
 
@@ -194,41 +195,44 @@ LIBBUTL_MODEXPORT namespace butl
   std::string sanitize_identifier (std::string&&);
   std::string sanitize_identifier (const std::string&);
 
-  // Return true if the string is a valid UTF-8 encoded byte sequence and,
-  // optionally, its decoded codepoints belong to the specified types or to
-  // the codepoint whitelist.
-  //
-  // Note that the Unicode Standard considers a UTF-8 byte sequence decoded
-  // into a codepoint of the surrogate type as invalid. Thus, the surrogate
-  // type may not be specified.
+  // Return true if the string is a valid UTF-8 encoded byte string and,
+  // optionally, its decoded codepoints belong to the specified types or
+  // codepoint whitelist.
   //
-  enum class codepoint_types: std::uint16_t
-  {
-    // Useful to only allow the whitelisted codepoints or when building the
-    // type set incrementally.
-    //
-    none = 0x00,
-
-    graphic       = 0x01, // L(etter), M(ark), N(number), P(uncturation),
-                          // S(symbol), Zs(separator, space)
-    format        = 0x02,
-    control       = 0x04,
-    private_use   = 0x08,
-    non_character = 0x10,
-    reserved      = 0x20,
-
-    any = 0x3f
-  };
+  bool
+  utf8 (const std::string&,
+        codepoint_types = codepoint_types::any,
+        const char32_t* whitelist = nullptr);
 
-  LIBBUTL_SYMEXPORT bool
+  // As above but in case of an invalid sequence also return the description
+  // of why it is invalid.
+  //
+  bool
   utf8 (const std::string&,
+        std::string& what,
         codepoint_types = codepoint_types::any,
         const char32_t* whitelist = nullptr);
 
-  codepoint_types operator&  (codepoint_types, codepoint_types);
-  codepoint_types operator|  (codepoint_types, codepoint_types);
-  codepoint_types operator&= (codepoint_types&, codepoint_types);
-  codepoint_types operator|= (codepoint_types&, codepoint_types);
+  // Return UTF-8 byte string length in codepoints. Throw
+  // std::invalid_argument if this is not a valid UTF-8.
+  //
+  std::size_t
+  utf8_length (const std::string&,
+               codepoint_types = codepoint_types::any,
+               const char32_t* whitelist = nullptr);
+
+  // Fixup the specified string (in place) to be valid UTF-8 replacing invalid
+  // bytes and codepoints with the specified character, for example, '?'.
+  //
+  // Potential future improvements:
+  // - char32_t replacement (will need UTF-8 encoding)
+  // - different replacement for bytes and codepoints
+  //
+  LIBBUTL_SYMEXPORT void
+  to_utf8 (std::string&,
+           char replacement,
+           codepoint_types = codepoint_types::any,
+           const char32_t* whitelist = nullptr);
 
   // If an input stream is in a failed state, then return true if this is
   // because of the eof and throw istream::failure otherwise. If the stream
-- 
cgit v1.1