aboutsummaryrefslogtreecommitdiff
path: root/libbutl/utf8.mxx
diff options
context:
space:
mode:
authorKaren Arutyunov <karen@codesynthesis.com>2020-02-26 17:16:45 +0300
committerKaren Arutyunov <karen@codesynthesis.com>2020-02-26 17:17:49 +0300
commit5ae9686adac1508873f2d980e84becd3496244c2 (patch)
treed7c88e678b29ed6bb7ae30b74bd01aa2b5d2e9a8 /libbutl/utf8.mxx
parentafb726d2d59b3715960a8647738860f40e37cf4f (diff)
Add notion of validator to char_scanner and make sure manifest is UTF-8
This involves implementing utf8_validator and UTF-8 utility functions and using them during the manifest parsing, serialization, and rewriting.
Diffstat (limited to 'libbutl/utf8.mxx')
-rw-r--r--libbutl/utf8.mxx130
1 files changed, 130 insertions, 0 deletions
diff --git a/libbutl/utf8.mxx b/libbutl/utf8.mxx
new file mode 100644
index 0000000..15e8ded
--- /dev/null
+++ b/libbutl/utf8.mxx
@@ -0,0 +1,130 @@
+// file : libbutl/utf8.mxx -*- C++ -*-
+// license : MIT; see accompanying LICENSE file
+
+#ifndef __cpp_modules_ts
+#pragma once
+#endif
+
+// C includes.
+
+#ifndef __cpp_lib_modules_ts
+#include <string>
+#include <cstdint> // uint8_t
+#include <utility> // pair
+#endif
+
+// Other includes.
+
+#ifdef __cpp_modules_ts
+export module butl.utf8;
+#ifdef __cpp_lib_modules_ts
+import std.core;
+#endif
+import butl.unicode;
+#else
+#include <libbutl/unicode.mxx>
+#endif
+
+#include <libbutl/export.hxx>
+
+LIBBUTL_MODEXPORT namespace butl
+{
+ // Here and below we will refer to bytes that encode a single Unicode
+ // codepoint as "UTF-8 byte sequence" ("UTF-8 sequence" or "byte sequence"
+ // for short) and a sequence of such sequences as "UTF-8 encoded byte
+ // string" ("byte string" for short).
+ //
+
+ // Validate a UTF-8 encoded byte string one byte at a time. Optionally, also
+ // validate that its decoded codepoints belong to the specified types or
+ // codepoint whitelist.
+ //
+ class utf8_validator
+ {
+ public:
+ // Note: the whitelist is used via shallow copy (only the pointer is kept).
+ //
+ explicit
+ utf8_validator (codepoint_types = codepoint_types::any,
+ const char32_t* whitelist = nullptr);
+
+ // Validate the next byte returning true if it is valid (first) and
+ // whether it is the last byte of a codepoint (second). The {false, true}
+ // result indicates a byte sequence decoded into a codepoint of undesired
+ // type rather than an invalid byte that happens to be the last in the
+ // sequence (and may well be a valid starting byte of the next sequence).
+ //
+ // Note that in case the byte is invalid, calling this function again
+ // without recovery is illegal.
+ //
+ std::pair<bool, bool>
+ validate (char);
+
+ // As above but in case of an invalid byte also return the description of
+ // why it is invalid.
+ //
+ // Note that the description only contains the reason why the specified
+ // byte is not part of a valid UTF-8 sequence or the desired codepoint
+ // type, for example:
+ //
+ // "invalid UTF-8 sequence first byte (0xB0)"
+ // "invalid Unicode codepoint (reserved)"
+ //
+ // It can be used to form complete diagnostics along these lines:
+ //
+ // cerr << "invalid manifest value " << name << ": " << what << endl;
+ //
+ std::pair<bool, bool>
+ validate (char, std::string& what);
+
+ // As above but decide whether the description is needed at runtime (what
+ // may be NULL, in which case no description is requested).
+ //
+ std::pair<bool, bool>
+ validate (char, std::string* what);
+
+ // Recover from an invalid byte.
+ //
+ // This function must be called with the first invalid and then subsequent
+ // bytes until it signals that the specified byte is valid. Note that it
+ // shall not be called if the sequence is decoded into a codepoint of an
+ // undesired type.
+ //
+ // Note also that a byte being invalid in the middle of a UTF-8 sequence
+ // may be valid as a first byte of the next sequence.
+ //
+ std::pair<bool, bool>
+ recover (char);
+
+ // Return the codepoint of the last byte sequence.
+ //
+ // This function can only be legally called after validate() or recover()
+ // signal that the preceding byte is valid and last.
+ //
+ char32_t
+ codepoint () const;
+
+ private:
+ codepoint_types types_;
+ const char32_t* whitelist_;
+
+ // State machine.
+ //
+ uint8_t seq_size_; // [1 4]; calculated at the first byte validation.
+ uint8_t seq_index_ = 0; // [0 3]
+
+ // Last byte sequence decoded codepoint (built incrementally).
+ //
+ char32_t codepoint_;
+
+ // The byte range a valid UTF-8 sequence second byte must belong to as
+ // calculated during the first byte validation.
+ //
+ // Note that the subsequent (third and fourth) bytes must belong to the
+ // [80 BF] range regardless of the previous bytes.
+ //
+ std::pair<unsigned char, unsigned char> byte2_range_;
+ };
+}
+
+#include <libbutl/utf8.ixx>