From df1ef68cd8e8582724ce1192bfc202e0b9aeaf0c Mon Sep 17 00:00:00 2001 From: Karen Arutyunov Date: Tue, 28 Sep 2021 19:24:31 +0300 Subject: Get rid of C++ modules related code and rename *.mxx files to *.hxx --- libbutl/utf8.mxx | 130 ------------------------------------------------------- 1 file changed, 130 deletions(-) delete mode 100644 libbutl/utf8.mxx (limited to 'libbutl/utf8.mxx') diff --git a/libbutl/utf8.mxx b/libbutl/utf8.mxx deleted file mode 100644 index 15e8ded..0000000 --- a/libbutl/utf8.mxx +++ /dev/null @@ -1,130 +0,0 @@ -// file : libbutl/utf8.mxx -*- C++ -*- -// license : MIT; see accompanying LICENSE file - -#ifndef __cpp_modules_ts -#pragma once -#endif - -// C includes. - -#ifndef __cpp_lib_modules_ts -#include <string> -#include <cstdint> // uint8_t -#include <utility> // pair -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -export module butl.utf8; -#ifdef __cpp_lib_modules_ts -import std.core; -#endif -import butl.unicode; -#else -#include -#endif - -#include - -LIBBUTL_MODEXPORT namespace butl -{ - // Here and below we will refer to bytes that encode a single Unicode - // codepoint as "UTF-8 byte sequence" ("UTF-8 sequence" or "byte sequence" - // for short) and a sequence of such sequences as "UTF-8 encoded byte - // string" ("byte string" for short). - // - - // Validate a UTF-8 encoded byte string one byte at a time. Optionally, also - // validate that its decoded codepoints belong to the specified types or - // codepoint whitelist. - // - class utf8_validator - { - public: - // Note: use whitelist via shallow copy. - // - explicit - utf8_validator (codepoint_types = codepoint_types::any, - const char32_t* whitelist = nullptr); - - // Validate the next byte returning true if it is valid (first) and - // whether it is the last byte of a codepoint (second). 
The {false, true} - // result indicates a byte sequence decoded into a codepoint of undesired - // type rather than an invalid byte that happens to be the last in the - // sequence (and may well be a valid starting byte of the next sequence). - // - // Note that in case the byte is invalid, calling this function again - // without recovery is illegal. - // - std::pair<bool, bool> - validate (char); - - // As above but in case of an invalid byte also return the description of - // why it is invalid. - // - // Note that the description only contains the reason why the specified - // byte is not part of a valid UTF-8 sequence or the desired codepoint - // type, for example: - // - // "invalid UTF-8 sequence first byte (0xB0)" - // "invalid Unicode codepoint (reserved)" - // - // It can be used to form complete diagnostics along these lines: - // - // cerr << "invalid manifest value " << name << ": " << what << endl; - // - std::pair<bool, bool> - validate (char, std::string& what); - - // As above but decide whether the description is needed at runtime (what - // may be NULL). - // - std::pair<bool, bool> - validate (char, std::string* what); - - // Recover from an invalid byte. - // - // This function must be called with the first invalid and then subsequent - // bytes until it signals that the specified byte is valid. Note that it - // shall not be called if the sequence is decoded into a codepoint of an - // undesired type. - // - // Note also that a byte being invalid in the middle of a UTF-8 sequence - // may be valid as a first byte of the next sequence. - // - std::pair<bool, bool> - recover (char); - - // Return the codepoint of the last byte sequence. - // - // This function can only be legally called after validate() or recover() - // signal that the preceding byte is valid and last. - // - char32_t - codepoint () const; - - private: - codepoint_types types_; - const char32_t* whitelist_; - - // State machine. - // - uint8_t seq_size_; // [1 4]; calculated at the first byte validation. 
- uint8_t seq_index_ = 0; // [0 3] - - // Last byte sequence decoded codepoint (built incrementally). - // - char32_t codepoint_; - - // The byte range a valid UTF-8 sequence second byte must belong to as - // calculated during the first byte validation. - // - // Note that the subsequent (third and fourth) bytes must belong to the - // [80 BF] range regardless of the previous bytes. - // - std::pair byte2_range_; - }; -} - -#include -- cgit v1.1