From df1ef68cd8e8582724ce1192bfc202e0b9aeaf0c Mon Sep 17 00:00:00 2001 From: Karen Arutyunov Date: Tue, 28 Sep 2021 19:24:31 +0300 Subject: Get rid of C++ modules related code and rename *.mxx files to *.hxx --- libbutl/utf8.hxx | 114 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 libbutl/utf8.hxx (limited to 'libbutl/utf8.hxx') diff --git a/libbutl/utf8.hxx b/libbutl/utf8.hxx new file mode 100644 index 0000000..697f77a --- /dev/null +++ b/libbutl/utf8.hxx @@ -0,0 +1,114 @@ +// file : libbutl/utf8.hxx -*- C++ -*- +// license : MIT; see accompanying LICENSE file + +#pragma once + +#include +#include // uint8_t +#include // pair + +#include + +#include + +namespace butl +{ + // Here and below we will refer to bytes that encode a single Unicode + // codepoint as "UTF-8 byte sequence" ("UTF-8 sequence" or "byte sequence" + // for short) and a sequence of such sequences as "UTF-8 encoded byte + // string" ("byte string" for short). + // + + // Validate a UTF-8 encoded byte string one byte at a time. Optionally, also + // validate that its decoded codepoints belong to the specified types or + // codepoint whitelist. + // + class utf8_validator + { + public: + // Note: use whitelist via shallow copy. + // + explicit + utf8_validator (codepoint_types = codepoint_types::any, + const char32_t* whitelist = nullptr); + + // Validate the next byte returning true if it is valid (first) and + // whether it is the last byte of a codepoint (second). The {false, true} + // result indicates a byte sequence decoded into a codepoint of undesired + // type rather than an invalid byte that happens to be the last in the + // sequence (and may well be a valid starting byte of the next sequence). + // + // Note that in case the byte is invalid, calling this function again + // without recovery is illegal. 
+ // + std::pair + validate (char); + + // As above but in case of an invalid byte also return the description of + // why it is invalid. + // + // Note that the description only contains the reason why the specified + // byte is not part of a valid UTF-8 sequence or the desired codepoint + // type, for example: + // + // "invalid UTF-8 sequence first byte (0xB0)" + // "invalid Unicode codepoint (reserved)" + // + // It can be used to form complete diagnostics along these lines: + // + // cerr << "invalid manifest value " << name << ": " << what << endl; + // + std::pair + validate (char, std::string& what); + + // As above but decide whether the description is needed at runtime (what + // may be NULL). + // + std::pair + validate (char, std::string* what); + + // Recover from an invalid byte. + // + // This function must be called with the first invalid and then subsequent + // bytes until it signals that the specified byte is valid. Note that it + // shall not be called if the sequence is decoded into a codepoint of an + // undesired type. + // + // Note also that a byte being invalid in the middle of a UTF-8 sequence + // may be valid as a first byte of the next sequence. + // + std::pair + recover (char); + + // Return the codepoint of the last byte sequence. + // + // This function can only be legally called after validate() or recover() + // signal that the preceding byte is valid and last. + // + char32_t + codepoint () const; + + private: + codepoint_types types_; + const char32_t* whitelist_; + + // State machine. + // + uint8_t seq_size_; // [1 4]; calculated at the first byte validation. + uint8_t seq_index_ = 0; // [0 3] + + // Last byte sequence decoded codepoint (built incrementally). + // + char32_t codepoint_; + + // The byte range a valid UTF-8 sequence second byte must belong to as + // calculated during the first byte validation. 
+ // + // Note that the subsequent (third and fourth) bytes must belong to the + // [80 BF] range regardless of the previous bytes. + // + std::pair byte2_range_; + }; +} + +#include -- cgit v1.1