aboutsummaryrefslogtreecommitdiff
path: root/libbutl/utility.mxx
diff options
context:
space:
mode:
authorKaren Arutyunov <karen@codesynthesis.com>2019-12-18 22:32:16 +0300
committerKaren Arutyunov <karen@codesynthesis.com>2020-01-28 15:18:33 +0300
commitacd7a29c02e222e27d474fe3b64b1ca11b3506c0 (patch)
treecf26139e835e7caad479cfb52e4e6c0fc2c887e2 /libbutl/utility.mxx
parent5cd9e0c25e39ff1449e38a9c74e131e7359e7183 (diff)
Add utf8() predicate
Diffstat (limited to 'libbutl/utility.mxx')
-rw-r--r--libbutl/utility.mxx46
1 files changed, 40 insertions, 6 deletions
diff --git a/libbutl/utility.mxx b/libbutl/utility.mxx
index 11aa013..3bb335a 100644
--- a/libbutl/utility.mxx
+++ b/libbutl/utility.mxx
@@ -192,8 +192,44 @@ LIBBUTL_MODEXPORT namespace butl
// Note that it doesn't make sure the first character is not a digit.
//
std::string& sanitize_identifier (std::string&);
- std::string sanitize_identifier (std::string&&);
- std::string sanitize_identifier (const std::string&);
+ std::string sanitize_identifier (std::string&&);
+ std::string sanitize_identifier (const std::string&);
+
// Return true if the string is a valid UTF-8 encoded byte sequence and,
// optionally, its decoded codepoints belong to the specified types or to
// the codepoint whitelist, unless it is NULL (which is the default).
+ //
+ // Note that the Unicode Standard considers a UTF-8 byte sequence decoded
+ // into a codepoint of the surrogate type as invalid. Thus, the surrogate
+ // type may not be specified.
+ //
// Codepoint type flags. Each type occupies its own bit so that multiple
// types can be combined into a set.
//
enum class codepoint_types: std::uint16_t
{
// Useful to only allow the whitelisted codepoints or when building the
// type set incrementally.
//
none = 0x00,
+
graphic = 0x01, // L(etter), M(ark), N(umber), P(unctuation),
// S(ymbol), Zs(separator, space)
format = 0x02,
control = 0x04,
private_use = 0x08,
non_character = 0x10,
reserved = 0x20,
+
any = 0x3f // All of the above types combined.
};
+
// By default all the codepoint types are allowed (codepoint_types::any) and
// no whitelist is used (nullptr).
//
// NOTE(review): the whitelist is presumably a NUL-terminated array of
// codepoints -- confirm against the implementation.
//
LIBBUTL_SYMEXPORT bool
utf8 (const std::string&,
codepoint_types = codepoint_types::any,
const char32_t* whitelist = nullptr);
+
// Bitwise operators that allow using codepoint_types as a set of flags.
//
// NOTE(review): the compound assignment forms are declared as returning
// codepoint_types by value rather than by reference -- confirm this is
// intentional and matches the definitions (not visible here).
//
codepoint_types operator& (codepoint_types, codepoint_types);
codepoint_types operator| (codepoint_types, codepoint_types);
codepoint_types operator&= (codepoint_types&, codepoint_types);
codepoint_types operator|= (codepoint_types&, codepoint_types);
// If an input stream is in a failed state, then return true if this is
// because of the eof and throw istream::failure otherwise. If the stream
@@ -219,14 +255,12 @@ LIBBUTL_MODEXPORT namespace butl
//
// Note that on Windows setting an empty value unsets the variable.
//
- LIBBUTL_SYMEXPORT
- void
+ LIBBUTL_SYMEXPORT void
setenv (const std::string& name, const std::string& value);
// Throw system_error on failure.
//
- LIBBUTL_SYMEXPORT
- void
+ LIBBUTL_SYMEXPORT void
unsetenv (const std::string&);
// Key comparators (i.e., to be used in sets, maps, etc).