aboutsummaryrefslogtreecommitdiff
path: root/libbutl/utf8.mxx
blob: 15e8ded7e0ca12d878439a1c3d140725ddde674f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
// file      : libbutl/utf8.mxx -*- C++ -*-
// license   : MIT; see accompanying LICENSE file

#ifndef __cpp_modules_ts
#pragma once
#endif

// C includes.

#ifndef __cpp_lib_modules_ts
#include <string>
#include <cstdint> // uint8_t
#include <utility> // pair
#endif

// Other includes.

#ifdef __cpp_modules_ts
export module butl.utf8;
#ifdef __cpp_lib_modules_ts
import std.core;
#endif
import butl.unicode;
#else
#include <libbutl/unicode.mxx>
#endif

#include <libbutl/export.hxx>

LIBBUTL_MODEXPORT namespace butl
{
  // Here and below we will refer to bytes that encode a single Unicode
  // codepoint as "UTF-8 byte sequence" ("UTF-8 sequence" or "byte sequence"
  // for short) and a sequence of such sequences as "UTF-8 encoded byte
  // string" ("byte string" for short).
  //

  // Validate a UTF-8 encoded byte string one byte at a time. Optionally, also
  // validate that its decoded codepoints belong to the specified types or
  // codepoint whitelist.
  //
  // The validator is an incremental state machine: bytes are fed in order to
  // validate() and, after an invalid byte, to recover() until it reports a
  // valid byte again.
  //
  class utf8_validator
  {
  public:
    // Create a validator that accepts codepoints of the specified types
    // and/or listed in the whitelist. By default any codepoint is accepted.
    //
    // Note: use whitelist via shallow copy. That is, only the pointer is
    // stored, so the pointed-to data must outlive the validator.
    //
    // NOTE(review): no length is passed, so the whitelist is presumably a
    // null-terminated array -- confirm against the implementation.
    //
    explicit
    utf8_validator (codepoint_types = codepoint_types::any,
                    const char32_t* whitelist = nullptr);

    // Validate the next byte returning true if it is valid (first) and
    // whether it is the last byte of a codepoint (second). The {false, true}
    // result indicates a byte sequence decoded into a codepoint of undesired
    // type rather than an invalid byte that happens to be the last in the
    // sequence (and may well be a valid starting byte of the next sequence).
    //
    // In other words, the possible results are:
    //
    // {true,  false} - valid byte, more bytes of the sequence are expected
    // {true,  true}  - valid byte that completes a codepoint
    // {false, true}  - sequence decoded into a codepoint of undesired type
    // {false, false} - invalid byte
    //
    // Note that in case the byte is invalid, calling this function again
    // without recovery is illegal.
    //
    std::pair<bool, bool>
    validate (char);

    // As above but in case of an invalid byte also return the description of
    // why it is invalid.
    //
    // Note that the description only contains the reason why the specified
    // byte is not part of a valid UTF-8 sequence or the desired codepoint
    // type, for example:
    //
    // "invalid UTF-8 sequence first byte (0xB0)"
    // "invalid Unicode codepoint (reserved)"
    //
    // It can be used to form complete diagnostics along these lines:
    //
    // cerr << "invalid manifest value " << name << ": " << what << endl;
    //
    std::pair<bool, bool>
    validate (char, std::string& what);

    // As above but decide whether the description is needed at runtime (what
    // may be NULL).
    //
    std::pair<bool, bool>
    validate (char, std::string* what);

    // Recover from an invalid byte.
    //
    // This function must be called with the first invalid and then subsequent
    // bytes until it signals that the specified byte is valid. Note that it
    // shall not be called if the sequence is decoded into a codepoint of an
    // undesired type.
    //
    // Note also that a byte being invalid in the middle of a UTF-8 sequence
    // may be valid as a first byte of the next sequence.
    //
    std::pair<bool, bool>
    recover (char);

    // Return the codepoint of the last byte sequence.
    //
    // This function can only be legally called after validate() or recover()
    // signal that the preceding byte is valid and last.
    //
    char32_t
    codepoint () const;

  private:
    // Acceptance criteria as passed to the constructor (the whitelist is not
    // owned; see the constructor note).
    //
    codepoint_types types_;
    const char32_t* whitelist_;

    // State machine.
    //
    // Note that the std::-qualified type name is used since <cstdint> is only
    // guaranteed to declare uint8_t in namespace std.
    //
    // Size of the current sequence, [1 4]; calculated at the first byte
    // validation (and has no meaningful value before that).
    //
    std::uint8_t seq_size_;

    // Index of the next byte in the current sequence, [0 3].
    //
    std::uint8_t seq_index_ = 0;

    // Last byte sequence decoded codepoint (built incrementally).
    //
    char32_t codepoint_;

    // The byte range a valid UTF-8 sequence second byte must belong to as
    // calculated during the first byte validation.
    //
    // Note that the subsequent (third and fourth) bytes must belong to the
    // [80 BF] range regardless of the previous bytes.
    //
    std::pair<unsigned char, unsigned char> byte2_range_;
  };
}

#include <libbutl/utf8.ixx>