libbuild2/lexer.hxx


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284

// file      : libbuild2/lexer.hxx -*- C++ -*-
// license   : MIT; see accompanying LICENSE file

#ifndef LIBBUILD2_LEXER_HXX
#define LIBBUILD2_LEXER_HXX

#include <stack>

#include <libbutl/utf8.mxx>
#include <libbutl/unicode.mxx>
#include <libbutl/char-scanner.mxx>

#include <libbuild2/types.hxx>
#include <libbuild2/utility.hxx>

#include <libbuild2/token.hxx>
#include <libbuild2/diagnostics.hxx>

#include <libbuild2/export.hxx>

namespace build2
{
  // Context-dependent lexing mode.
  //
  // Quoted modes are internal and should not be set explicitly. In the value
  // mode we don't treat certain characters (e.g., `+`, `=`) as special so
  // that we can use them in the variable values, e.g., `foo = g++`. In
  // contrast, in the variable mode, we restrict certain character (e.g., `/`)
  // from appearing in the name. The values mode is like value but recogizes
  // `,` as special (used in contexts where we need to list multiple
  // values). The attributes/attribute_value modes are like values where each
  // value is potentially a variable assignment; they don't treat `{` and `}`
  // as special (so we cannot have name groups in attributes) as well as
  // recognizes `=` and `]`. The subscript mode is like value but doesn't
  // treat `{` and `}` as special and recognizes `]`. The eval mode is used in
  // the evaluation context.
  //
  // A number of modes are "derived" from the value/values mode by recognizing
  // a few extra characters:
  //
  //   switch_expressions  values plus `:`
  //   case_patterns       values plus `|` and `:`
  //
  // Note that the normal, value/values and derived, as well as eval modes
  // split words separated by the pair character (to disable pairs one can
  // pass `\0` as a pair character).
  //
  // The normal mode recognizes `%` and `{{...` at the beginning of the line
  // as special. The cmdvar mode is like normal but does not treat these
  // character sequences as special.
  //
  // Finally, the foreign mode reads everything until encountering a line that
  // contains nothing (besides whitespaces) other than the closing multi-
  // curly-brace (`}}...`) (or eos) returning the contents as the word token
  // followed by the multi_rcbrace (or eos). In a way it is similar to the
  // single-quote mode. The number of closing braces to expect is passed as
  // mode data.
  //
  // The mode data is also used by a few other modes. The buildspec mode uses
  // it as a boolean value to decide whether to recognize newlines as tokens.
  // In the variable mode the mode data may be a pointer to a C string that
  // contains a list of special one-character variable names to recognize
  // (e.g., $<, $~, etc). Note that the parser has a special kludge to also
  // recognize them as $(<), etc.
  //
  // The alternative modes must be set manually. The value/values and derived
  // modes automatically expires after the end of the line. The attribute and
  // subscript modes expires after the closing `]`. The variable mode expires
  // after the word token. The eval mode expires after the closing `)`. And
  // the foreign mode expires after the closing braces.
  //
  // Note that normally it is only safe to switch mode when the current token
  // is not quoted (or, more generally, when you are not in the double-quoted
  // mode) unless the mode treats the double-quote as a separator (e.g.,
  // variable name mode). Failed that your mode (which now will be the top of
  // the mode stack) will prevent proper recognition of the closing quote.
  //
  // The `[` token is used for attributes (where it cuts across most of the
  // modes) as well as for value subscript (where it is only recognized after
  // expansions). It is handled with a flag. In the normal mode it is
  // automatically set at the beginning and after each newline. In all other
  // modes it must be explicitly set at points where attribute/subscript is
  // recognized. In all the cases it is automatically reset after lexing the
  // next token (whether `[` or not).

  // Extendable/inheritable enum-like class.
  //
  struct lexer_mode: lexer_mode_base
  {
    using base_type = lexer_mode_base;

    enum
    {
      normal = base_type::value_next,
      cmdvar,
      variable,
      value,
      values,
      case_patterns,
      switch_expressions,
      attributes,
      attribute_value,
      subscript,
      eval,
      single_quoted,
      double_quoted,
      foreign,
      buildspec,

      value_next
    };

    lexer_mode () = default;
    lexer_mode (value_type v): base_type (v) {}
    lexer_mode (base_type v): base_type (v) {}
  };

  class LIBBUILD2_SYMEXPORT lexer:
    public butl::char_scanner<butl::utf8_validator, 2>
  {
  public:
    // If escape is not NULL then only escape sequences with characters from
    // this string are considered "effective escapes" with all others passed
    // through as is. Note that neither the name nor escape arguments are
    // copied.
    //
    lexer (istream& is,
           const path_name& name,
           uint64_t line = 1, // Start line in the stream.
           const char* escapes = nullptr)
      : lexer (is, name, line, escapes, true /* set_mode */) {}

    const path_name&
    name () const {return name_;}

    // Note: sets mode for the next token. The second argument can be used to
    // specify the pair separator character (if the mode supports pairs). If
    // escapes is not specified, then inherit the current mode's (though a
    // mode can also override it).
    //
    virtual void
    mode (lexer_mode,
          char pair_separator = '\0',
          optional<const char*> escapes = nullopt,
          uintptr_t data = 0);

    // Enable `[` recognition for the next token.
    //
    void
    enable_lsbrace (bool unsep = false)
    {
      state_.top ().lsbrace = true;
      state_.top ().lsbrace_unsep = unsep;
    }

    // Expire the current mode early.
    //
    void
    expire_mode () {state_.pop ();}

    lexer_mode
    mode () const {return state_.top ().mode;}

    char
    pair_separator () const {return state_.top ().sep_pair;}

    // Scanner. Note that it is ok to call next() again after getting eos.
    //
    // If you extend the lexer and add a custom lexer mode, then you must
    // override next() and handle the custom mode there.
    //
    virtual token
    next ();

    // Peek at the first two characters of the next token(s). Return the
    // characters or '\0' if either would be eos. Also return an indicator of
    // whether the next token would be separated. Note: cannot be used to peek
    // at the first character of a line.
    //
    // Note also that it assumes that the current mode and the potential new
    // mode in which these characters will actually be parsed use the same
    // whitespace separation (the sep_space and sep_newline values).
    //
    pair<pair<char, char>, bool>
    peek_chars ();

  protected:
    struct state
    {
      lexer_mode      mode;
      uintptr_t       data;
      optional<token> hold;

      bool lsbrace;       // Recognize `[`.
      bool lsbrace_unsep; // Recognize it only if unseparated.

      char sep_pair;
      bool sep_space;    // Are whitespaces separators (see skip_spaces())?
      bool sep_newline;  // Is newline special (see skip_spaces())?
      bool quotes;       // Recognize quoted fragments.

      const char* escapes; // Effective escape sequences to recognize.

      // Word separator characters. For two-character sequence put the first
      // one in sep_first and the second one in the corresponding position of
      // sep_second. If it's a single-character sequence, then put space in
      // sep_second. If there are multiple sequences that start with the same
      // character, then repeat the first character in sep_first.
      //
      const char* sep_first;
      const char* sep_second;
    };

    token
    next_eval ();

    token
    next_quoted ();

    token
    next_foreign ();

    // Lex a word assuming current is the top state (which may already have
    // been "expired" from the top).
    //
    virtual token
    word (state current, bool separated);

    // Return true in first if we have seen any spaces. Skipped empty lines
    // don't count. In other words, we are only interested in spaces that are
    // on the same line as the following non-space character. Return true in
    // second if we have started skipping spaces from column 1 (note that
    // if this mode does not skip spaces, then second will always be false).
    //
    pair<bool, bool>
    skip_spaces ();

    // Diagnostics.
    //
  protected:
    fail_mark fail;

    // Lexer state.
    //
  protected:
    lexer (istream& is, const path_name& name, uint64_t line,
           const char* escapes,
           bool set_mode)
      : char_scanner (is,
                      butl::utf8_validator (butl::codepoint_types::graphic,
                                            U"\n\r\t"),
                      true /* crlf */,
                      line),
        fail ("error", &name),
        name_ (name),
        sep_ (false)
    {
      if (set_mode)
        mode (lexer_mode::normal, '@', escapes);
    }

    const path_name& name_;
    std::stack<state> state_;

    bool sep_; // True if we skipped spaces in peek().
  };
}

// Diagnostics plumbing.
//
namespace butl // ADL
{
  inline build2::location
  get_location (const butl::char_scanner<butl::utf8_validator, 2>::xchar& c,
                const void* data)
  {
    using namespace build2;

    assert (data != nullptr); // E.g., must be &lexer::name_.
    return location (*static_cast<const path_name*> (data), c.line, c.column);
  }
}

#endif // LIBBUILD2_LEXER_HXX