aboutsummaryrefslogtreecommitdiff
path: root/libbutl/char-scanner.mxx
blob: 60994cf761e5d80784b65f8c09c21c35aef5ccac (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
// file      : libbutl/char-scanner.mxx -*- C++ -*-
// license   : MIT; see accompanying LICENSE file

#ifndef __cpp_modules_ts
#pragma once
#endif

#include <cassert>

#ifndef __cpp_lib_modules_ts
#include <string>  // char_traits
#include <cstddef> // size_t
#include <cstdint> // uint64_t
#include <climits> // INT_*
#include <utility> // pair, make_pair()
#include <istream>
#endif

// Other includes.

#ifdef __cpp_modules_ts
export module butl.char_scanner;
#ifdef __cpp_lib_modules_ts
import std.core;
import std.io;
#endif
import butl.fdstream;
#else
#include <libbutl/fdstream.mxx>
#endif

#include <libbutl/export.hxx>

LIBBUTL_MODEXPORT namespace butl
{
  // Validator that performs no checking: every character is reported as
  // (true, true) and the description string is never modified. Refer to
  // utf8_validator for details on the validator interface.
  //
  struct noop_validator
  {
    // Validate the next character (always succeeds).
    //
    std::pair<bool, bool>
    validate (char)
    {
      return {true, true};
    }

    // As above but with the (unused) description of why the character is
    // invalid.
    //
    std::pair<bool, bool>
    validate (char c, std::string&)
    {
      return validate (c);
    }
  };

  // Low-level character stream scanner. Normally used as a base for
  // higher-level lexers.
  //
  // The V template argument is the validator type (see noop_validator for
  // the expected interface) and N is the size of the unget buffer.
  //
  template <typename V = noop_validator, std::size_t N = 1>
  class char_scanner
  {
  public:
    using validator_type = V;
    static constexpr const std::size_t unget_depth = N;

    // If the crlf argument is true, then recognize Windows newlines (0x0D
    // 0x0A) and convert them to just '\n' (0x0A). Note that a standalone
    // 0x0D is treated "as if" it was followed by 0x0A and multiple 0x0D
    // are treated as one.
    //
    // Note also that if the stream happens to be ifdstream, then it includes
    // a number of optimizations that assume nobody else is messing with the
    // stream.
    //
    // The line and position arguments can be used to override the start line
    // and position in the stream (useful when re-scanning data saved with the
    // save_* facility).
    //
    char_scanner (std::istream&,
                  bool crlf = true,
                  std::uint64_t line = 1,
                  std::uint64_t position = 0);

    char_scanner (std::istream&,
                  validator_type,
                  bool crlf = true,
                  std::uint64_t line = 1,
                  std::uint64_t position = 0);

    char_scanner (const char_scanner&) = delete;
    char_scanner& operator= (const char_scanner&) = delete;

    // Scanner interface.
    //
  public:

    // Extended character. It includes line/column/position information and is
    // capable of representing EOF and invalid characters.
    //
    // Note that implicit conversion of EOF/invalid to char_type results in
    // NUL character (which means in most cases it is safe to compare xchar to
    // char without checking for EOF).
    //
    class xchar
    {
    public:
      using traits_type = std::char_traits<char>;
      using int_type = traits_type::int_type;
      using char_type = traits_type::char_type;

      int_type value;

      // Note that the column is of the codepoint this byte belongs to.
      //
      std::uint64_t line;
      std::uint64_t column;

      // Logical character position (see ifdstream for details on the logical
      // part) if the scanned stream is ifdstream and always zero otherwise.
      //
      std::uint64_t position;

      // Value used to represent an invalid character: INT_MIN unless that
      // happens to be the EOF value, in which case INT_MAX.
      //
      static int_type
      invalid () {return traits_type::eof () != INT_MIN ? INT_MIN : INT_MAX;}

      // Convert to char_type mapping both EOF and invalid to NUL.
      //
      operator char_type () const
      {
        return value != traits_type::eof () && value != invalid ()
          ? static_cast<char_type> (value)
          : char_type (0);
      }

      // Note: deliberately not explicit so that a plain character (or
      // int_type value) converts to xchar with zero line/column/position.
      //
      xchar (int_type v = 0,
             std::uint64_t l = 0,
             std::uint64_t c = 0,
             std::uint64_t p = 0)
          : value (v), line (l), column (c), position (p) {}
    };

    // Note that if any of the get() or peek() functions return an invalid
    // character, then the scanning has failed and none of them should be
    // called again.

    xchar
    get ();

    // As above but in case of an invalid character also return the
    // description of why it is invalid.
    //
    xchar
    get (std::string& what);

    void
    get (const xchar& peeked); // Get previously peeked character (faster).

    void
    unget (const xchar&);

    // Note that if there is an "ungot" character, peek() will return that.
    //
    xchar
    peek ();

    // As above but in case of an invalid character also return the
    // description of why it is invalid.
    //
    xchar
    peek (std::string& what);

    // Tests. In the future we can add tests like alpha(), alnum(), etc.
    //
    static bool
    eos (const xchar& c) {return c.value == xchar::traits_type::eof ();}

    static bool
    invalid (const xchar& c) {return c.value == xchar::invalid ();}

    // Line, column and position of the next character to be extracted from
    // the stream by peek() or get().
    //
    std::uint64_t line;
    std::uint64_t column;
    std::uint64_t position;

    // Ability to save raw data as it is being scanned. Note that the
    // character is only saved when it is got, not peeked.
    //
  public:
    // Start saving to the specified string. It is an error (asserted) to
    // call this function if saving is already in progress.
    //
    void
    save_start (std::string& b)
    {
      assert (save_ == nullptr);
      save_ = &b;
    }

    // Stop saving. It is an error (asserted) to call this function if saving
    // is not in progress.
    //
    void
    save_stop ()
    {
      assert (save_ != nullptr);
      save_ = nullptr;
    }

    // Call save_start() on construction and save_stop() on destruction
    // unless stop() has already been called explicitly.
    //
    // The guard is not copyable: a copy would result in two guards calling
    // save_stop() on the same scanner with the second call failing the
    // assertion (or, if assertions are disabled, with the first destroyed
    // copy silently ending the saving early). It can, however, be moved, in
    // which case the moved-from guard is disarmed.
    //
    struct save_guard
    {
      explicit
      save_guard (char_scanner& s, std::string& b): s_ (&s) {s.save_start (b);}

      save_guard (save_guard&& g) noexcept: s_ (g.s_) {g.s_ = nullptr;}

      save_guard (const save_guard&) = delete;
      save_guard& operator= (const save_guard&) = delete;

      // Stop saving early. Calling this function more than once is harmless.
      //
      void
      stop () {if (s_ != nullptr) {s_->save_stop (); s_ = nullptr;}}

      ~save_guard () {stop ();}

    private:
      char_scanner* s_; // NULL if stopped or moved from.
    };

  protected:
    using int_type  = typename xchar::int_type;
    using char_type = typename xchar::char_type;

    int_type
    peek_ ();

    void
    get_ ();

    std::uint64_t
    pos_ () const;

    xchar
    get (std::string* what);

    xchar
    peek (std::string* what);

  protected:
    std::istream& is_;

    validator_type val_;
    bool decoded_   = true;  // The peeked character is last byte of sequence.
    bool validated_ = false; // The peeked character has been validated.

    // Note that if you are reading from the buffer directly, then it is also
    // your responsibility to call the validator and save the data (see
    // save_*()).
    //
    // Besides that, make sure that the peek() call preceding the scan is
    // followed by the get() call (see validated_, decoded_, and unpeek_ for
    // the hairy details; realistically, you would probably only direct-scan
    // ASCII fragments).
    //
    fdbuf* buf_; // NULL if not ifdstream.
    const char_type* gptr_;
    const char_type* egptr_;

    std::string* save_ = nullptr; // NULL if not saving.

    bool crlf_;
    bool eos_ = false;

    std::size_t ungetn_ = 0;
    xchar ungetb_[N];

    bool unpeek_ = false;
    xchar unpeekc_ = '\0';
  };
}

#include <libbutl/char-scanner.ixx>
#include <libbutl/char-scanner.txx>