diff options
Diffstat (limited to 'libbuild2/lexer.cxx')
-rw-r--r-- | libbuild2/lexer.cxx | 720 |
1 files changed, 720 insertions, 0 deletions
diff --git a/libbuild2/lexer.cxx b/libbuild2/lexer.cxx new file mode 100644 index 0000000..fd13c31 --- /dev/null +++ b/libbuild2/lexer.cxx @@ -0,0 +1,720 @@ +// file : libbuild2/lexer.cxx -*- C++ -*- +// copyright : Copyright (c) 2014-2019 Code Synthesis Ltd +// license : MIT; see accompanying LICENSE file + +#include <libbuild2/lexer.hxx> + +#include <cstring> // strchr() + +using namespace std; + +namespace build2 +{ + using type = token_type; + + pair<char, bool> lexer:: + peek_char () + { + sep_ = skip_spaces (); + xchar c (peek ()); + return make_pair (eos (c) ? '\0' : char (c), sep_); + } + + void lexer:: + mode (lexer_mode m, char ps, optional<const char*> esc) + { + const char* s1 (nullptr); + const char* s2 (nullptr); + bool s (true); + bool n (true); + bool q (true); + + if (!esc) + { + assert (!state_.empty ()); + esc = state_.top ().escapes; + } + + switch (m) + { + case lexer_mode::normal: + { + s1 = ":<>=+ $(){}[]#\t\n"; + s2 = " = "; + break; + } + case lexer_mode::value: + { + s1 = " $(){}[]#\t\n"; + s2 = " "; + break; + } + case lexer_mode::attribute: + { + s1 = " $(]#\t\n"; + s2 = " "; + break; + } + case lexer_mode::eval: + { + s1 = ":<>=!&|?, $(){}[]#\t\n"; + s2 = " = &| "; + break; + } + case lexer_mode::buildspec: + { + // Like the value mode with these differences: + // + // 1. Returns '(' as a separated token provided the state stack depth + // is less than or equal to 3 (initial state plus two buildspec) + // (see parse_buildspec() for details). + // + // 2. Recognizes comma. + // + // 3. Treat newline as an ordinary space. + // + s1 = " $(){}[],\t\n"; + s2 = " "; + n = false; + break; + } + case lexer_mode::single_quoted: + case lexer_mode::double_quoted: + s = false; + // Fall through. + case lexer_mode::variable: + { + // These are handled in an ad hoc way in word(). + assert (ps == '\0'); + break; + } + default: assert (false); // Unhandled custom mode. + } + + state_.push (state {m, ps, s, n, q, *esc, s1, s2}); + } + + token lexer:: + next () + { + const state& st (state_.top ()); + lexer_mode m (st.mode); + + // For some modes we have dedicated imlementations of next(). + // + switch (m) + { + case lexer_mode::normal: + case lexer_mode::value: + case lexer_mode::attribute: + case lexer_mode::variable: + case lexer_mode::buildspec: break; + case lexer_mode::eval: return next_eval (); + case lexer_mode::double_quoted: return next_quoted (); + default: assert (false); // Unhandled custom mode. + } + + bool sep (skip_spaces ()); + + xchar c (get ()); + uint64_t ln (c.line), cn (c.column); + + auto make_token = [&sep, ln, cn] (type t, string v = string ()) + { + return token (t, move (v), + sep, quote_type::unquoted, false, + ln, cn, token_printer); + }; + + if (eos (c)) + return make_token (type::eos); + + // Handle pair separator. + // + if (c == st.sep_pair) + return make_token (type::pair_separator, string (1, c)); + + switch (c) + { + // NOTE: remember to update mode(), next_eval() if adding new special + // characters. + // + case '\n': + { + // Expire value mode at the end of the line. + // + if (m == lexer_mode::value) + state_.pop (); + + sep = true; // Treat newline as always separated. + return make_token (type::newline); + } + case '{': return make_token (type::lcbrace); + case '}': return make_token (type::rcbrace); + case '[': return make_token (type::lsbrace); + case ']': + { + // Expire attribute mode after closing ']'. + // + if (m == lexer_mode::attribute) + state_.pop (); + + return make_token (type::rsbrace); + } + case '$': return make_token (type::dollar); + case ')': return make_token (type::rparen); + case '(': + { + // Left paren is always separated in the buildspec mode. + // + if (m == lexer_mode::buildspec && state_.size () <= 3) + sep = true; + + return make_token (type::lparen); + } + } + + // The following characters are special in the normal and variable modes. + // + if (m == lexer_mode::normal || m == lexer_mode::variable) + { + switch (c) + { + // NOTE: remember to update mode(), next_eval() if adding new special + // characters. + // + case ':': return make_token (type::colon); + case '=': + { + if (peek () == '+') + { + get (); + return make_token (type::prepend); + } + else + return make_token (type::assign); + } + case '+': + { + if (peek () == '=') + { + get (); + return make_token (type::append); + } + } + } + } + + // The following characters are special in the normal mode. + // + if (m == lexer_mode::normal) + { + // NOTE: remember to update mode() if adding new special characters. + // + switch (c) + { + case '<': return make_token (type::labrace); + case '>': return make_token (type::rabrace); + } + } + + // The following characters are special in the buildspec mode. + // + if (m == lexer_mode::buildspec) + { + // NOTE: remember to update mode() if adding new special characters. + // + switch (c) + { + case ',': return make_token (type::comma); + } + } + + // Otherwise it is a word. + // + unget (c); + return word (st, sep); + } + + token lexer:: + next_eval () + { + bool sep (skip_spaces ()); + xchar c (get ()); + + if (eos (c)) + fail (c) << "unterminated evaluation context"; + + const state& st (state_.top ()); + + uint64_t ln (c.line), cn (c.column); + + auto make_token = [sep, ln, cn] (type t, string v = string ()) + { + return token (t, move (v), + sep, quote_type::unquoted, false, + ln, cn, token_printer); + }; + + // This mode is quite a bit like the value mode when it comes to special + // characters, except that we have some of our own. + // + + // Handle pair separator. + // + if (c == st.sep_pair) + return make_token (type::pair_separator, string (1, c)); + + // Note: we don't treat [ and ] as special here. Maybe can use them for + // something later. + // + switch (c) + { + // NOTE: remember to update mode() if adding new special characters. + // + case '\n': fail (c) << "newline in evaluation context" << endf; + case ':': return make_token (type::colon); + case '{': return make_token (type::lcbrace); + case '}': return make_token (type::rcbrace); + case '[': return make_token (type::lsbrace); + case ']': return make_token (type::rsbrace); + case '$': return make_token (type::dollar); + case '?': return make_token (type::question); + case ',': return make_token (type::comma); + case '(': return make_token (type::lparen); + case ')': + { + state_.pop (); // Expire eval mode. + return make_token (type::rparen); + } + // Potentially two-character tokens. + // + case '=': + case '!': + case '<': + case '>': + case '|': + case '&': + { + xchar p (peek ()); + + type r (type::eos); + switch (c) + { + case '|': if (p == '|') r = type::log_or; break; + case '&': if (p == '&') r = type::log_and; break; + + case '<': r = (p == '=' ? type::less_equal : type::less); break; + case '>': r = (p == '=' ? type::greater_equal : type::greater); break; + + case '=': if (p == '=') r = type::equal; break; + + case '!': r = (p == '=' ? type::not_equal : type::log_not); break; + } + + if (r == type::eos) + break; + + switch (r) + { + case type::less: + case type::greater: + case type::log_not: break; + default: get (); + } + + return make_token (r); + } + } + + // Otherwise it is a word. + // + unget (c); + return word (st, sep); + } + + token lexer:: + next_quoted () + { + xchar c (get ()); + + if (eos (c)) + fail (c) << "unterminated double-quoted sequence"; + + uint64_t ln (c.line), cn (c.column); + + auto make_token = [ln, cn] (type t) + { + return token (t, false, quote_type::double_, ln, cn, token_printer); + }; + + switch (c) + { + case '$': return make_token (type::dollar); + case '(': return make_token (type::lparen); + } + + // Otherwise it is a word. + // + unget (c); + return word (state_.top (), false); + } + + token lexer:: + word (state st, bool sep) + { + lexer_mode m (st.mode); + + xchar c (peek ()); + assert (!eos (c)); + + uint64_t ln (c.line), cn (c.column); + + string lexeme; + quote_type qtype (m == lexer_mode::double_quoted + ? quote_type::double_ + : quote_type::unquoted); + + // If we are already in the quoted mode then we didn't start with the + // quote character. + // + bool qcomp (false); + + auto append = [&lexeme, &m, &qcomp] (char c) + { + lexeme += c; + + // An unquoted character after a quoted fragment. + // + if (qcomp && m != lexer_mode::double_quoted) + qcomp = false; + }; + + for (; !eos (c); c = peek ()) + { + // First handle escape sequences. + // + if (c == '\\') + { + // In the variable mode we treat the beginning of the escape sequence + // as a separator (think \"$foo\"). + // + if (m == lexer_mode::variable) + break; + + get (); + xchar p (peek ()); + + const char* esc (st.escapes); + + if (esc == nullptr || + (*esc != '\0' && !eos (p) && strchr (esc, p) != nullptr)) + { + get (); + + if (eos (p)) + fail (p) << "unterminated escape sequence"; + + if (p != '\n') // Ignore if line continuation. + append (p); + + continue; + } + else + unget (c); // Treat as a normal character. + } + + bool done (false); + + // Next take care of the double-quoted mode. This one is tricky since + // we push/pop modes while accumulating the same lexeme for example: + // + // foo" bar "baz + // + if (m == lexer_mode::double_quoted) + { + switch (c) + { + // Only these two characters are special in the double-quoted mode. + // + case '$': + case '(': + { + done = true; + break; + } + // End quote. + // + case '\"': + { + get (); + state_.pop (); + + st = state_.top (); + m = st.mode; + continue; + } + } + } + // We also handle the variable mode in an ad hoc way. + // + else if (m == lexer_mode::variable) + { + if (c != '_' && !(lexeme.empty () ? alpha (c) : alnum (c))) + { + if (c != '.') + done = true; + else + { + // Normally '.' is part of the variable (namespace separator) + // unless it is trailing (think $major.$minor). + // + get (); + xchar p (peek ()); + done = eos (p) || !(alpha (p) || p == '_'); + unget (c); + } + } + } + else + { + // First check if it's a pair separator. + // + if (c == st.sep_pair) + done = true; + else + { + // Then see if this character or character sequence is a separator. + // + for (const char* p (strchr (st.sep_first, c)); + p != nullptr; + p = done ? nullptr : strchr (p + 1, c)) + { + char s (st.sep_second[p - st.sep_first]); + + // See if it has a second. + // + if (s != ' ') + { + get (); + done = (peek () == s); + unget (c); + } + else + done = true; + } + } + + // Handle single and double quotes if enabled for this mode and unless + // they were considered separators. + // + if (st.quotes && !done) + { + switch (c) + { + case '\'': + { + // Enter the single-quoted mode in case the derived lexer needs + // to notice this. + // + mode (lexer_mode::single_quoted); + + switch (qtype) + { + case quote_type::unquoted: + qtype = quote_type::single; + qcomp = lexeme.empty (); + break; + case quote_type::single: + qcomp = false; // Non-contiguous. + break; + case quote_type::double_: + qtype = quote_type::mixed; + // Fall through. + case quote_type::mixed: + qcomp = false; + break; + } + + get (); + for (c = get (); !eos (c) && c != '\''; c = get ()) + lexeme += c; + + if (eos (c)) + fail (c) << "unterminated single-quoted sequence"; + + state_.pop (); + continue; + } + case '\"': + { + get (); + + mode (lexer_mode::double_quoted); + st = state_.top (); + m = st.mode; + + switch (qtype) + { + case quote_type::unquoted: + qtype = quote_type::double_; + qcomp = lexeme.empty (); + break; + case quote_type::double_: + qcomp = false; // Non-contiguous. + break; + case quote_type::single: + qtype = quote_type::mixed; + // Fall through. + case quote_type::mixed: + qcomp = false; + break; + } + + continue; + } + } + } + } + + if (done) + break; + + get (); + append (c); + } + + if (m == lexer_mode::double_quoted) + { + if (eos (c)) + fail (c) << "unterminated double-quoted sequence"; + + // If we are still in the quoted mode then we didn't end with the quote + // character. + // + if (qcomp) + qcomp = false; + } + + // Expire variable mode at the end of the word. + // + if (m == lexer_mode::variable) + state_.pop (); + + return token (move (lexeme), sep, qtype, qcomp, ln, cn); + } + + bool lexer:: + skip_spaces () + { + bool r (sep_); + sep_ = false; + + const state& s (state_.top ()); + + // In some special modes we don't skip spaces. + // + if (!s.sep_space) + return r; + + xchar c (peek ()); + bool start (c.column == 1); + + for (; !eos (c); c = peek ()) + { + switch (c) + { + case ' ': + case '\t': + { + r = true; + break; + } + case '\n': + { + // In some modes we treat newlines as ordinary spaces. + // + if (!s.sep_newline) + { + r = true; + break; + } + + // Skip empty lines. + // + if (start) + { + r = false; + break; + } + + return r; + } + case '#': + { + r = true; + get (); + + // See if this is a multi-line comment in the form: + // + /* + #\ + ... + #\ + */ + auto ml = [&c, this] () -> bool + { + if ((c = peek ()) == '\\') + { + get (); + if ((c = peek ()) == '\n') + return true; + } + + return false; + }; + + if (ml ()) + { + // Scan until we see the closing one. + // + for (; !eos (c); c = peek ()) + { + get (); + if (c == '#' && ml ()) + break; + } + + if (eos (c)) + fail (c) << "unterminated multi-line comment"; + } + else + { + // Read until newline or eos. + // + for (; !eos (c) && c != '\n'; c = peek ()) + get (); + } + + continue; + } + case '\\': + { + get (); + + if (peek () == '\n') + break; // Ignore. + + unget (c); + } + // Fall through. + default: + return r; // Not a space. + } + + get (); + } + + return r; + } +} |