Add notion of validator to char_scanner and make sure manifest is UTF-8

This involves implementing utf8_validator and UTF-8 utility functions and using them during the manifest parsing, serialization, and rewriting.
author: Karen Arutyunov <karen@codesynthesis.com> 2020-02-26 17:16:45 +0300
committer: Karen Arutyunov <karen@codesynthesis.com> 2020-02-26 17:17:49 +0300
commit: 5ae9686adac1508873f2d980e84becd3496244c2 (patch)
tree: d7c88e678b29ed6bb7ae30b74bd01aa2b5d2e9a8 /libbutl/utf8.mxx
parent: afb726d2d59b3715960a8647738860f40e37cf4f (diff)
1 files changed, 130 insertions, 0 deletions
diff --git a/libbutl/utf8.mxx b/libbutl/utf8.mxx
new file mode 100644
index 0000000..15e8ded
--- /dev/null
+++ b/libbutl/utf8.mxx
@@ -0,0 +1,130 @@
+// file      : libbutl/utf8.mxx -*- C++ -*-
+// license   : MIT; see accompanying LICENSE file
+
+#ifndef __cpp_modules_ts
+#pragma once
+#endif
+
+// C includes.
+
+#ifndef __cpp_lib_modules_ts
+#include <string>
+#include <cstdint> // uint8_t
+#include <utility> // pair
+#endif
+
+// Other includes.
+
+#ifdef __cpp_modules_ts
+export module butl.utf8;
+#ifdef __cpp_lib_modules_ts
+import std.core;
+#endif
+import butl.unicode;
+#else
+#include <libbutl/unicode.mxx>
+#endif
+
+#include <libbutl/export.hxx>
+
+LIBBUTL_MODEXPORT namespace butl
+{
+  // Here and below we will refer to bytes that encode a single Unicode
+  // codepoint as "UTF-8 byte sequence" ("UTF-8 sequence" or "byte sequence"
+  // for short) and a sequence of such sequences as "UTF-8 encoded byte
+  // string" ("byte string" for short).
+  //
+
+  // Validate a UTF-8 encoded byte string one byte at a time. Optionally, also
+  // validate that its decoded codepoints belong to the specified types or
+  // codepoint whitelist.
+  //
+  class utf8_validator
+  {
+  public:
+    // Note: the whitelist is used via a shallow copy (array is not copied).
+    //
+    explicit
+    utf8_validator (codepoint_types = codepoint_types::any,
+                    const char32_t* whitelist = nullptr);
+
+    // Validate the next byte, returning whether it is valid (first) and
+    // whether it is the last byte of a codepoint (second). The {false, true}
+    // result indicates a byte sequence decoded into a codepoint of undesired
+    // type rather than an invalid byte that happens to be the last in the
+    // sequence (and may well be a valid starting byte of the next sequence).
+    //
+    // Note that in case the byte is invalid, calling this function again
+    // without recovery is illegal.
+    //
+    std::pair<bool, bool>
+    validate (char);
+
+    // As above but in case of an invalid byte also return the description of
+    // why it is invalid.
+    //
+    // Note that the description only contains the reason why the specified
+    // byte is not part of a valid UTF-8 sequence or the desired codepoint
+    // type, for example:
+    //
+    // "invalid UTF-8 sequence first byte (0xB0)"
+    // "invalid Unicode codepoint (reserved)"
+    //
+    // It can be used to form complete diagnostics along these lines:
+    //
+    // cerr << "invalid manifest value " << name << ": " << what << endl;
+    //
+    std::pair<bool, bool>
+    validate (char, std::string& what);
+
+    // As above but decide at runtime whether the description is needed (what
+    // may be NULL).
+    //
+    std::pair<bool, bool>
+    validate (char, std::string* what);
+
+    // Recover from an invalid byte.
+    //
+    // This function must be called with the first invalid and then subsequent
+    // bytes until it signals that the specified byte is valid. Note that it
+    // shall not be called if the sequence is decoded into a codepoint of an
+    // undesired type.
+    //
+    // Note also that a byte being invalid in the middle of a UTF-8 sequence
+    // may be valid as a first byte of the next sequence.
+    //
+    std::pair<bool, bool>
+    recover (char);
+
+    // Return the codepoint of the last byte sequence.
+    //
+    // This function can only be legally called after validate() or recover()
+    // signals that the preceding byte is valid and last.
+    //
+    char32_t
+    codepoint () const;
+
+  private:
+    codepoint_types types_;     // Desired codepoint types.
+    const char32_t* whitelist_; // Codepoint whitelist (not owned); may be NULL.
+
+    // State machine.
+    //
+    uint8_t seq_size_;      // [1 4]; calculated at the first byte validation.
+    uint8_t seq_index_ = 0; // [0 3]
+
+    // Last byte sequence decoded codepoint (built incrementally).
+    //
+    char32_t codepoint_;
+
+    // The byte range a valid UTF-8 sequence second byte must belong to as
+    // calculated during the first byte validation.
+    //
+    // Note that the subsequent (third and fourth) bytes must belong to the
+    // [80 BF] range regardless of the previous bytes.
+    //
+    std::pair<unsigned char, unsigned char> byte2_range_;
+  };
+}
+
+#include <libbutl/utf8.ixx>
author	Karen Arutyunov <karen@codesynthesis.com>	2020-02-26 17:16:45 +0300
committer	Karen Arutyunov <karen@codesynthesis.com>	2020-02-26 17:17:49 +0300
commit	5ae9686adac1508873f2d980e84becd3496244c2 (patch)
tree	d7c88e678b29ed6bb7ae30b74bd01aa2b5d2e9a8 /libbutl/utf8.mxx
parent	afb726d2d59b3715960a8647738860f40e37cf4f (diff)