From 5ae9686adac1508873f2d980e84becd3496244c2 Mon Sep 17 00:00:00 2001
From: Karen Arutyunov <karen@codesynthesis.com>
Date: Wed, 26 Feb 2020 17:16:45 +0300
Subject: Add notion of validator to char_scanner and make sure manifest is
 UTF-8

This involves implementing utf8_validator and UTF-8 utility functions and
using them during the manifest parsing, serialization, and rewriting.
---
 libbutl/utf8.mxx | 130 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 130 insertions(+)
 create mode 100644 libbutl/utf8.mxx

(limited to 'libbutl/utf8.mxx')
diff --git a/libbutl/utf8.mxx b/libbutl/utf8.mxx
new file mode 100644
index 0000000..15e8ded
--- /dev/null
+++ b/libbutl/utf8.mxx
@@ -0,0 +1,130 @@
+// file      : libbutl/utf8.mxx -*- C++ -*-
+// license   : MIT; see accompanying LICENSE file
+
+#ifndef __cpp_modules_ts
+#pragma once
+#endif
+
+// C includes.
+
+#ifndef __cpp_lib_modules_ts
+#include <string>
+#include <cstdint> // uint8_t
+#include <utility> // pair
+#endif
+
+// Other includes.
+
+#ifdef __cpp_modules_ts
+export module butl.utf8;
+#ifdef __cpp_lib_modules_ts
+import std.core;
+#endif
+import butl.unicode;
+#else
+#include <libbutl/unicode.mxx>
+#endif
+
+#include <libbutl/export.hxx>
+
+LIBBUTL_MODEXPORT namespace butl
+{
+  // Here and below we will refer to bytes that encode a single Unicode
+  // codepoint as "UTF-8 byte sequence" ("UTF-8 sequence" or "byte sequence"
+  // for short) and a sequence of such sequences as "UTF-8 encoded byte
+  // string" ("byte string" for short).
+  //
+
+  // Validate a UTF-8 encoded byte string one byte at a time. Optionally, also
+  // validate that its decoded codepoints belong to the specified types or
+  // codepoint whitelist.
+  //
+  class utf8_validator
+  {
+  public:
+    // Note: whitelist is used via shallow copy (only the pointer is stored).
+    //
+    explicit
+    utf8_validator (codepoint_types = codepoint_types::any,
+                    const char32_t* whitelist = nullptr);
+
+    // Validate the next byte returning true if it is valid (first) and
+    // whether it is the last byte of a codepoint (second). The {false, true}
+    // result indicates a byte sequence decoded into a codepoint of undesired
+    // type rather than an invalid byte that happens to be the last in the
+    // sequence (and may well be a valid starting byte of the next sequence).
+    //
+    // Note that in case the byte is invalid, calling this function again
+    // without recovery is illegal.
+    //
+    std::pair<bool, bool>
+    validate (char);
+
+    // As above but in case of an invalid byte also return the description of
+    // why it is invalid.
+    //
+    // Note that the description only contains the reason why the specified
+    // byte is not part of a valid UTF-8 sequence or the desired codepoint
+    // type, for example:
+    //
+    // "invalid UTF-8 sequence first byte (0xB0)"
+    // "invalid Unicode codepoint (reserved)"
+    //
+    // It can be used to form complete diagnostics along these lines:
+    //
+    // cerr << "invalid manifest value " << name << ": " << what << endl;
+    //
+    std::pair<bool, bool>
+    validate (char, std::string& what);
+
+    // As above but decide whether the description is needed at runtime (what
+    // may be NULL).
+    //
+    std::pair<bool, bool>
+    validate (char, std::string* what);
+
+    // Recover from an invalid byte.
+    //
+    // This function must be called with the first invalid and then subsequent
+    // bytes until it signals that the specified byte is valid. Note that it
+    // shall not be called if the sequence is decoded into a codepoint of an
+    // undesired type.
+    //
+    // Note also that a byte being invalid in the middle of a UTF-8 sequence
+    // may be valid as a first byte of the next sequence.
+    //
+    std::pair<bool, bool>
+    recover (char);
+
+    // Return the codepoint of the last byte sequence.
+    //
+    // This function can only be legally called after validate() or recover()
+    // signal that the preceding byte is valid and last.
+    //
+    char32_t
+    codepoint () const;
+
+  private:
+    codepoint_types types_;
+    const char32_t* whitelist_;
+
+    // State machine.
+    //
+    uint8_t seq_size_;      // [1 4]; calculated at the first byte validation.
+    uint8_t seq_index_ = 0; // [0 3]
+
+    // Last byte sequence decoded codepoint (built incrementally).
+    //
+    char32_t codepoint_;
+
+    // The byte range a valid UTF-8 sequence second byte must belong to as
+    // calculated during the first byte validation.
+    //
+    // Note that the subsequent (third and fourth) bytes must belong to the
+    // [80 BF] range regardless of the previous bytes.
+    //
+    std::pair<unsigned char, unsigned char> byte2_range_;
+  };
+}
+
+#include <libbutl/utf8.ixx>
-- 
cgit v1.1