libbutl/utf8.hxx


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114

// file      : libbutl/utf8.hxx -*- C++ -*-
// license   : MIT; see accompanying LICENSE file

#pragma once

#include <string>
#include <cstdint> // uint8_t
#include <utility> // pair

#include <libbutl/unicode.hxx>

#include <libbutl/export.hxx>

namespace butl
{
  // Here and below we will refer to bytes that encode a single Unicode
  // codepoint as "UTF-8 byte sequence" ("UTF-8 sequence" or "byte sequence"
  // for short) and a sequence of such sequences as "UTF-8 encoded byte
  // string" ("byte string" for short).
  //

  // Validate a UTF-8 encoded byte string one byte at a time. Optionally, also
  // validate that its decoded codepoints belong to the specified types or
  // codepoint whitelist.
  //
  class utf8_validator
  {
  public:
    // Create a validator that accepts codepoints of the specified types or
    // those listed in the whitelist (if any).
    //
    // Note: use whitelist via shallow copy (only the pointer is kept, see
    // whitelist_ below).
    //
    explicit
    utf8_validator (codepoint_types = codepoint_types::any,
                    const char32_t* whitelist = nullptr);

    // Validate the next byte returning true if it is valid (first) and
    // whether it is the last byte of a codepoint (second). The {false, true}
    // result indicates a byte sequence decoded into a codepoint of undesired
    // type rather than an invalid byte that happens to be the last in the
    // sequence (and may well be a valid starting byte of the next sequence).
    //
    // Note that in case the byte is invalid, calling this function again
    // without recovery is illegal.
    //
    std::pair<bool, bool>
    validate (char);

    // As above but in case of an invalid byte also return the description of
    // why it is invalid.
    //
    // Note that the description only contains the reason why the specified
    // byte is not part of a valid UTF-8 sequence or the desired codepoint
    // type, for example:
    //
    // "invalid UTF-8 sequence first byte (0xB0)"
    // "invalid Unicode codepoint (reserved)"
    //
    // It can be used to form complete diagnostics along these lines:
    //
    // cerr << "invalid manifest value " << name << ": " << what << endl;
    //
    std::pair<bool, bool>
    validate (char, std::string& what);

    // As above but decide whether the description is needed at runtime (what
    // may be NULL).
    //
    std::pair<bool, bool>
    validate (char, std::string* what);

    // Recover from an invalid byte.
    //
    // This function must be called with the first invalid and then subsequent
    // bytes until it signals that the specified byte is valid. Note that it
    // shall not be called if the sequence is decoded into a codepoint of an
    // undesired type.
    //
    // Note also that a byte being invalid in the middle of a UTF-8 sequence
    // may be valid as a first byte of the next sequence.
    //
    std::pair<bool, bool>
    recover (char);

    // Return the codepoint of the last byte sequence.
    //
    // This function can only be legally called after validate() or recover()
    // signal that the preceding byte is valid and last.
    //
    char32_t
    codepoint () const;

  private:
    codepoint_types types_;
    const char32_t* whitelist_;

    // State machine.
    //
    // Note that all the state members have in-class initializers so that a
    // freshly constructed validator never holds indeterminate values.
    //
    std::uint8_t seq_size_ = 0;  // [1 4]; calculated at the first byte
                                 // validation (0 until then).
    std::uint8_t seq_index_ = 0; // [0 3]

    // Last byte sequence decoded codepoint (built incrementally).
    //
    char32_t codepoint_ = 0;

    // The byte range a valid UTF-8 sequence second byte must belong to as
    // calculated during the first byte validation.
    //
    // Note that the subsequent (third and fourth) bytes must belong to the
    // [80 BF] range regardless of the previous bytes.
    //
    std::pair<unsigned char, unsigned char> byte2_range_;
  };
}

#include <libbutl/utf8.ixx>