about | summary | refs | log | tree | commit | diff
path: root/libbutl/utility.ixx
diff options
context:
space:
mode:
author: Karen Arutyunov <karen@codesynthesis.com> 2020-02-26 17:16:45 +0300
committer: Karen Arutyunov <karen@codesynthesis.com> 2020-02-26 17:17:49 +0300
commit: 5ae9686adac1508873f2d980e84becd3496244c2 (patch)
tree: d7c88e678b29ed6bb7ae30b74bd01aa2b5d2e9a8 /libbutl/utility.ixx
parent: afb726d2d59b3715960a8647738860f40e37cf4f (diff)
Add notion of validator to char_scanner and make sure manifest is UTF-8
This involves implementing utf8_validator and UTF-8 utility functions and using them during the manifest parsing, serialization, and rewriting.
Diffstat (limited to 'libbutl/utility.ixx')
-rw-r--r-- libbutl/utility.ixx | 95
1 files changed, 69 insertions, 26 deletions
diff --git a/libbutl/utility.ixx b/libbutl/utility.ixx
index c5fdbac..27ef7fb 100644
--- a/libbutl/utility.ixx
+++ b/libbutl/utility.ixx
@@ -2,8 +2,11 @@
// license : MIT; see accompanying LICENSE file
#ifndef __cpp_lib_modules_ts
-#include <cstdlib> // getenv()
-#include <algorithm>
+#include <cctype> // toupper(), tolower(), is*()
+#include <cwctype> // isw*()
+#include <cstdlib> // getenv()
+#include <algorithm> // for_each()
+#include <stdexcept> // invalid_argument
#endif
namespace butl
@@ -216,44 +219,84 @@ namespace butl
return sanitize_identifier (std::string (s));
}
- inline codepoint_types
- operator&= (codepoint_types& x, codepoint_types y)
+ inline bool
+ eof (std::istream& is)
{
- return x = static_cast<codepoint_types> (
- static_cast<std::uint16_t> (x) &
- static_cast<std::uint16_t> (y));
+ if (!is.fail ())
+ return false;
+
+ if (is.eof ())
+ return true;
+
+ throw std::istream::failure ("");
}
- inline codepoint_types
- operator|= (codepoint_types& x, codepoint_types y)
+ inline optional<std::size_t>
+ utf8_length_impl (const std::string& s,
+ std::string* what,
+ codepoint_types ts,
+ const char32_t* wl)
{
- return x = static_cast<codepoint_types> (
- static_cast<std::uint16_t> (x) |
- static_cast<std::uint16_t> (y));
+ using namespace std;
+
+ // Optimize for an empty string.
+ //
+ if (s.empty ())
+ return 0;
+
+ size_t r (0);
+ pair<bool, bool> v;
+ utf8_validator val (ts, wl);
+
+ for (char c: s)
+ {
+ v = val.validate (c, what);
+
+ if (!v.first) // Invalid byte?
+ return nullopt;
+
+ if (v.second) // Last byte in the sequence?
+ ++r;
+ }
+
+ // Make sure that the last UTF-8 sequence is complete.
+ //
+ if (!v.second)
+ {
+ if (what != nullptr)
+ *what = "incomplete UTF-8 sequence";
+
+ return nullopt;
+ }
+
+ return r;
}
- inline codepoint_types
- operator& (codepoint_types x, codepoint_types y)
+ inline std::size_t
+ utf8_length (const std::string& s, codepoint_types ts, const char32_t* wl)
{
- return x &= y;
+ using namespace std;
+
+ string what;
+ if (optional<size_t> r = utf8_length_impl (s, &what, ts, wl))
+ return *r;
+
+ throw invalid_argument (what);
}
- inline codepoint_types
- operator| (codepoint_types x, codepoint_types y)
+ inline bool
+ utf8 (const std::string& s,
+ std::string& what,
+ codepoint_types ts,
+ const char32_t* wl)
{
- return x |= y;
+ return utf8_length_impl (s, &what, ts, wl).has_value ();
}
inline bool
- eof (std::istream& is)
+ utf8 (const std::string& s, codepoint_types ts, const char32_t* wl)
{
- if (!is.fail ())
- return false;
-
- if (is.eof ())
- return true;
-
- throw std::istream::failure ("");
+ return utf8_length_impl (s, nullptr, ts, wl).has_value ();
}
inline optional<std::string>