From 5ae9686adac1508873f2d980e84becd3496244c2 Mon Sep 17 00:00:00 2001 From: Karen Arutyunov Date: Wed, 26 Feb 2020 17:16:45 +0300 Subject: Add notion of validator to char_scanner and make sure manifest is UTF-8 This involves implementing utf8_validator and UTF-8 utility functions and using them during the manifest parsing, serialization, and rewriting. --- libbutl/utf8.mxx | 130 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 130 insertions(+) create mode 100644 libbutl/utf8.mxx (limited to 'libbutl/utf8.mxx') diff --git a/libbutl/utf8.mxx b/libbutl/utf8.mxx new file mode 100644 index 0000000..15e8ded --- /dev/null +++ b/libbutl/utf8.mxx @@ -0,0 +1,130 @@ +// file : libbutl/utf8.mxx -*- C++ -*- +// license : MIT; see accompanying LICENSE file + +#ifndef __cpp_modules_ts +#pragma once +#endif + +// C includes. + +#ifndef __cpp_lib_modules_ts +#include +#include // uint8_t +#include // pair +#endif + +// Other includes. + +#ifdef __cpp_modules_ts +export module butl.utf8; +#ifdef __cpp_lib_modules_ts +import std.core; +#endif +import butl.unicode; +#else +#include +#endif + +#include + +LIBBUTL_MODEXPORT namespace butl +{ + // Here and below we will refer to bytes that encode a singe Unicode + // codepoint as "UTF-8 byte sequence" ("UTF-8 sequence" or "byte sequence" + // for short) and a sequence of such sequences as "UTF-8 encoded byte + // string" ("byte string" for short). + // + + // Validate a UTF-8 encoded byte string one byte at a time. Optionally, also + // validate that its decoded codepoints belong to the specified types or + // codepoint whitelist. + // + class utf8_validator + { + public: + // Note: use whitelist via shallow copy. + // + explicit + utf8_validator (codepoint_types = codepoint_types::any, + const char32_t* whitelist = nullptr); + + // Validate the next byte returning true if it is valid (first) and + // whether it is the last byte of a codepoint (second). The {false, true} + // result indicates a byte sequence decoded into a codepoint of undesired + // type rather than an invalid byte that happens to be the last in the + // sequence (and may well be a valid starting byte of the next sequence). + // + // Note that in case the byte is invalid, calling this function again + // without recovery is illegal. + // + std::pair + validate (char); + + // As above but in case of an invalid byte also return the description of + // why it is invalid. + // + // Note that the description only contains the reason why the specified + // byte is not part of a valid UTF-8 sequence or the desired codepoint + // type, for example: + // + // "invalid UTF-8 sequence first byte (0xB0)" + // "invalid Unicode codepoint (reserved)" + // + // It can be used to form complete diagnostics along these lines: + // + // cerr << "invalid manifest value " << name << ": " << what << endl; + // + std::pair + validate (char, std::string& what); + + // As above but decide whether the description is needed at runtime (what + // may be NULL). + // + std::pair + validate (char, std::string* what); + + // Recover from an invalid byte. + // + // This function must be called with the first invalid and then subsequent + // bytes until it signals that the specified byte is valid. Note that it + // shall not be called if the sequence is decoded into a codepoint of an + // undesired type. + // + // Note also that a byte being invalid in the middle of a UTF-8 sequence + // may be valid as a first byte of the next sequence. + // + std::pair + recover (char); + + // Return the codepoint of the last byte sequence. + // + // This function can only be legally called after validate() or recover() + // signal that the preceding byte is valid and last. + // + char32_t + codepoint () const; + + private: + codepoint_types types_; + const char32_t* whitelist_; + + // State machine. + // + uint8_t seq_size_; // [1 4]; calculated at the first byte validation. + uint8_t seq_index_ = 0; // [0 3] + + // Last byte sequence decoded codepoint (built incrementally). + // + char32_t codepoint_; + + // The byte range a valid UTF-8 sequence second byte must belong to as + // calculated during the first byte validation. + // + // Note that the subsequent (third and forth) bytes must belong to the + // [80 BF] range regardless to the previous bytes. + // + std::pair byte2_range_; + }; +} + +#include -- cgit v1.1