From 5ae9686adac1508873f2d980e84becd3496244c2 Mon Sep 17 00:00:00 2001
From: Karen Arutyunov
Date: Wed, 26 Feb 2020 17:16:45 +0300
Subject: Add notion of validator to char_scanner and make sure manifest is UTF-8

This involves implementing utf8_validator and UTF-8 utility functions and
using them during the manifest parsing, serialization, and rewriting.
---
 libbutl/utility.ixx | 95 ++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 69 insertions(+), 26 deletions(-)

(limited to 'libbutl/utility.ixx')

diff --git a/libbutl/utility.ixx b/libbutl/utility.ixx
index c5fdbac..27ef7fb 100644
--- a/libbutl/utility.ixx
+++ b/libbutl/utility.ixx
@@ -2,8 +2,11 @@
 // license : MIT; see accompanying LICENSE file
 
 #ifndef __cpp_lib_modules_ts
-#include <cstdlib> // getenv()
-#include
+#include <cctype>    // toupper(), tolower(), is*()
+#include <cwctype>   // isw*()
+#include <cstdlib>   // getenv()
+#include <algorithm> // for_each()
+#include <stdexcept> // invalid_argument
 #endif
 
 namespace butl
@@ -216,44 +219,84 @@ namespace butl
     return sanitize_identifier (std::string (s));
   }
 
-  inline codepoint_types
-  operator&= (codepoint_types& x, codepoint_types y)
+  inline bool
+  eof (std::istream& is)
   {
-    return x = static_cast<codepoint_types> (
-      static_cast (x) &
-      static_cast (y));
+    if (!is.fail ())
+      return false;
+
+    if (is.eof ())
+      return true;
+
+    throw std::istream::failure ("");
   }
 
-  inline codepoint_types
-  operator|= (codepoint_types& x, codepoint_types y)
+  inline optional<std::size_t>
+  utf8_length_impl (const std::string& s,
+                    std::string* what,
+                    codepoint_types ts,
+                    const char32_t* wl)
   {
-    return x = static_cast<codepoint_types> (
-      static_cast (x) |
-      static_cast (y));
+    using namespace std;
+
+    // Optimize for an empty string.
+    //
+    if (s.empty ())
+      return 0;
+
+    size_t r (0);
+    pair<bool, bool> v;
+    utf8_validator val (ts, wl);
+
+    for (char c: s)
+    {
+      v = val.validate (c, what);
+
+      if (!v.first) // Invalid byte?
+        return nullopt;
+
+      if (v.second) // Last byte in the sequence?
+        ++r;
+    }
+
+    // Make sure that the last UTF-8 sequence is complete.
+    //
+    if (!v.second)
+    {
+      if (what != nullptr)
+        *what = "incomplete UTF-8 sequence";
+
+      return nullopt;
+    }
+
+    return r;
   }
 
-  inline codepoint_types
-  operator& (codepoint_types x, codepoint_types y)
+  inline std::size_t
+  utf8_length (const std::string& s, codepoint_types ts, const char32_t* wl)
   {
-    return x &= y;
+    using namespace std;
+
+    string what;
+    if (optional<size_t> r = utf8_length_impl (s, &what, ts, wl))
+      return *r;
+
+    throw invalid_argument (what);
   }
 
-  inline codepoint_types
-  operator| (codepoint_types x, codepoint_types y)
+  inline bool
+  utf8 (const std::string& s,
+        std::string& what,
+        codepoint_types ts,
+        const char32_t* wl)
   {
-    return x |= y;
+    return utf8_length_impl (s, &what, ts, wl).has_value ();
   }
 
   inline bool
-  eof (std::istream& is)
+  utf8 (const std::string& s, codepoint_types ts, const char32_t* wl)
   {
-    if (!is.fail ())
-      return false;
-
-    if (is.eof ())
-      return true;
-
-    throw std::istream::failure ("");
+    return utf8_length_impl (s, nullptr, ts, wl).has_value ();
   }
 
   inline optional
-- 
cgit v1.1