aboutsummaryrefslogtreecommitdiff
path: root/libbutl/utility.mxx
diff options
context:
space:
mode:
author Karen Arutyunov <karen@codesynthesis.com> 2020-02-26 17:16:45 +0300
committer Karen Arutyunov <karen@codesynthesis.com> 2020-02-26 17:17:49 +0300
commit 5ae9686adac1508873f2d980e84becd3496244c2 (patch)
tree d7c88e678b29ed6bb7ae30b74bd01aa2b5d2e9a8 /libbutl/utility.mxx
parent afb726d2d59b3715960a8647738860f40e37cf4f (diff)
Add notion of validator to char_scanner and make sure manifest is UTF-8
This involves implementing utf8_validator and UTF-8 utility functions and using them during the manifest parsing, serialization, and rewriting.
Diffstat (limited to 'libbutl/utility.mxx')
-rw-r--r-- libbutl/utility.mxx 70
1 files changed, 37 insertions, 33 deletions
diff --git a/libbutl/utility.mxx b/libbutl/utility.mxx
index 71c2860..b84e731 100644
--- a/libbutl/utility.mxx
+++ b/libbutl/utility.mxx
@@ -16,13 +16,10 @@
#include <iosfwd> // ostream
#include <istream>
#include <cstddef> // size_t
-#include <utility> // move(), forward()
+#include <utility> // move(), forward(), pair
#include <cstring> // strcmp(), strlen()
#include <exception> // exception, uncaught_exception[s]()
//#include <functional> // hash
-
-#include <cctype> // toupper(), tolower(), is*()
-#include <cwctype> // isw*()
#endif
#include <libbutl/ft/lang.hxx> // thread_local
@@ -34,8 +31,12 @@ export module butl.utility;
import std.core;
import std.io;
#endif
+import butl.utf8;
+import butl.unicode;
import butl.optional;
#else
+#include <libbutl/utf8.mxx>
+#include <libbutl/unicode.mxx>
#include <libbutl/optional.mxx>
#endif
@@ -194,41 +195,44 @@ LIBBUTL_MODEXPORT namespace butl
std::string sanitize_identifier (std::string&&);
std::string sanitize_identifier (const std::string&);
- // Return true if the string is a valid UTF-8 encoded byte sequence and,
- // optionally, its decoded codepoints belong to the specified types or to
- // the codepoint whitelist.
- //
- // Note that the Unicode Standard considers a UTF-8 byte sequence decoded
- // into a codepoint of the surrogate type as invalid. Thus, the surrogate
- // type may not be specified.
+ // Return true if the string is a valid UTF-8 encoded byte string and,
+ // optionally, its decoded codepoints belong to the specified types or
+ // codepoint whitelist.
//
- enum class codepoint_types: std::uint16_t
- {
- // Useful to only allow the whitelisted codepoints or when building the
- // type set incrementally.
- //
- none = 0x00,
-
- graphic = 0x01, // L(etter), M(ark), N(number), P(uncturation),
- // S(symbol), Zs(separator, space)
- format = 0x02,
- control = 0x04,
- private_use = 0x08,
- non_character = 0x10,
- reserved = 0x20,
-
- any = 0x3f
- };
+ bool
+ utf8 (const std::string&,
+ codepoint_types = codepoint_types::any,
+ const char32_t* whitelist = nullptr);
- LIBBUTL_SYMEXPORT bool
+ // As above, but in case of an invalid sequence also return a description
+ // of why it is invalid.
+ //
+ bool
utf8 (const std::string&,
+ std::string& what,
codepoint_types = codepoint_types::any,
const char32_t* whitelist = nullptr);
- codepoint_types operator& (codepoint_types, codepoint_types);
- codepoint_types operator| (codepoint_types, codepoint_types);
- codepoint_types operator&= (codepoint_types&, codepoint_types);
- codepoint_types operator|= (codepoint_types&, codepoint_types);
+ // Return the length of a UTF-8 byte string in codepoints. Throw
+ // std::invalid_argument if the string is not valid UTF-8.
+ //
+ std::size_t
+ utf8_length (const std::string&,
+ codepoint_types = codepoint_types::any,
+ const char32_t* whitelist = nullptr);
+
+ // Fix up the specified string (in place) to be valid UTF-8, replacing invalid
+ // bytes and codepoints with the specified character, for example, '?'.
+ //
+ // Potential future improvements:
+ // - char32_t replacement (will need UTF-8 encoding)
+ // - different replacement for bytes and codepoints
+ //
+ LIBBUTL_SYMEXPORT void
+ to_utf8 (std::string&,
+ char replacement,
+ codepoint_types = codepoint_types::any,
+ const char32_t* whitelist = nullptr);
// If an input stream is in a failed state, then return true if this is
// because of the eof and throw istream::failure otherwise. If the stream