aboutsummaryrefslogtreecommitdiff
path: root/libbutl/char-scanner.mxx
blob: 457435be3b4852679f5a51dd51e9ca1e31b2b89c (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
// file      : libbutl/char-scanner.mxx -*- C++ -*-
// copyright : Copyright (c) 2014-2019 Code Synthesis Ltd
// license   : MIT; see accompanying LICENSE file

#ifndef __cpp_modules
#pragma once
#endif

#include <cassert>

#ifndef __cpp_lib_modules
#include <string>  // char_traits
#include <cstdint> // uint64_t
#include <istream>
#endif

// Other includes.

#ifdef __cpp_modules
export module butl.char_scanner;
#ifdef __cpp_lib_modules
import std.core;
import std.io;
#endif
import butl.fdstream;
#else
#include <libbutl/fdstream.mxx>
#endif

#include <libbutl/export.hxx>

LIBBUTL_MODEXPORT namespace butl
{
  // Low-level character stream scanner. Normally used as a base for
  // higher-level lexers.
  //
  class LIBBUTL_SYMEXPORT char_scanner
  {
  public:
    // If the crlf argument is true, then recognize Windows newlines (0x0D
    // 0x0A) and convert them to just '\n' (0x0A). Note that a standalone
    // 0x0D is treated "as if" it was followed by 0x0A and multiple 0x0D
    // are treated as one.
    //
    // Note also that if the stream happens to be ifdstream, then it includes
    // a number of optimizations that assume nobody else is messing with the
    // stream.
    //
    // The line and position arguments can be used to override the start line
    // and position in the stream (useful when re-scanning data saved with the
    // save_* facility).
    //
    char_scanner (std::istream& is,
                  bool crlf = true,
                  std::uint64_t line = 1,
                  std::uint64_t position = 0);

    char_scanner (const char_scanner&) = delete;
    char_scanner& operator= (const char_scanner&) = delete;

    // Scanner interface.
    //
  public:

    // Extended character. It includes line/column/position information and is
    // capable of representing EOF.
    //
    // Note that implicit conversion of EOF to char_type results in NUL
    // character (which means in most cases it is safe to compare xchar to
    // char without checking for EOF).
    //
    class xchar
    {
    public:
      using traits_type = std::char_traits<char>;
      using int_type = traits_type::int_type;
      using char_type = traits_type::char_type;

      int_type value;
      std::uint64_t line;
      std::uint64_t column;

      // Logical character position (see ifdstream for details on the logical
      // part) if the scanned stream is ifdstream and always zero otherwise.
      //
      std::uint64_t position;

      operator char_type () const
      {
        return value != traits_type::eof ()
          ? static_cast<char_type> (value)
          : char_type (0);
      }

      xchar (int_type v,
             std::uint64_t l = 0,
             std::uint64_t c = 0,
             std::uint64_t p = 0)
          : value (v), line (l), column (c), position (p) {}
    };

    xchar
    get ();

    void
    get (const xchar& peeked); // Get previously peeked character (faster).

    void
    unget (const xchar&);

    // Note that if there is an "ungot" character, peek() will return
    // that.
    //
    xchar
    peek ();

    // Tests. In the future we can add tests line alpha(), alnum(),
    // etc.
    //
    static bool
    eos (const xchar& c) {return c.value == xchar::traits_type::eof ();}

    // Line, column and position of the next character to be extracted from
    // the stream by peek() or get().
    //
    std::uint64_t line;
    std::uint64_t column;
    std::uint64_t position;

    // Ability to save raw data as it is being scanned. Note that the
    // character is only saved when it is got, not peeked.
    //
  public:
    void
    save_start (std::string& b)
    {
      assert (save_ == nullptr);
      save_ = &b;
    }

    void
    save_stop ()
    {
      assert (save_ != nullptr);
      save_ = nullptr;
    }

    struct save_guard
    {
      explicit
      save_guard (char_scanner& s, std::string& b): s_ (&s) {s.save_start (b);}

      void
      stop () {if (s_ != nullptr) {s_->save_stop (); s_ = nullptr;}}

      ~save_guard () {stop ();}

    private:
      char_scanner* s_;
    };

  protected:
    using int_type = xchar::int_type;
    using char_type = xchar::char_type;

    int_type
    peek_ ();

    void
    get_ ();

    std::uint64_t
    pos_ () const;

  protected:
    std::istream& is_;

    // Note that if you are reading from the buffer directly, then it is
    // also your responsibility to save the data.
    //
    fdbuf* buf_; // NULL if not ifdstream.
    const char_type* gptr_;
    const char_type* egptr_;

    std::string* save_ = nullptr;

    bool crlf_;
    bool eos_ = false;

    bool unget_ = false;
    bool unpeek_ = false;

    xchar ungetc_ = '\0';
    xchar unpeekc_ = '\0';
  };
}

#include <libbutl/char-scanner.ixx>