// file      : libbuild2/lexer.cxx -*- C++ -*-
// copyright : Copyright (c) 2014-2019 Code Synthesis Ltd
// license   : MIT; see accompanying LICENSE file

#include <libbuild2/lexer.hxx>

#include <cstring> // strchr()

using namespace std;

namespace build2
{
  using type = token_type;

  pair<char, bool> lexer::
  peek_char ()
  {
    sep_ = skip_spaces ();
    xchar c (peek ());
    return make_pair (eos (c) ? '\0' : char (c), sep_);
  }

  void lexer::
  mode (lexer_mode m, char ps, optional<const char*> esc)
  {
    const char* s1 (nullptr);
    const char* s2 (nullptr);
    bool s (true);
    bool n (true);
    bool q (true);

    if (!esc)
    {
      assert (!state_.empty ());
      esc = state_.top ().escapes;
    }

    switch (m)
    {
    case lexer_mode::normal:
      {
        s1 = ":<>=+ $(){}[]#\t\n";
        s2 = "    =           ";
        break;
      }
    case lexer_mode::value:
      {
        s1 = " $(){}[]#\t\n";
        s2 = "           ";
        break;
      }
    case lexer_mode::values:
      {
        s1 = " $(){}[],#\t\n";
        s2 = "            ";
        break;
      }
    case lexer_mode::switch_expressions:
      {
        s1 = " $(){}[],:#\t\n";
        s2 = "             ";
        break;
      }
    case lexer_mode::case_patterns:
      {
        s1 = " $(){}[],|:#\t\n";
        s2 = "              ";
        break;
      }
    case lexer_mode::attribute:
      {
        s1 = " $(]#\t\n";
        s2 = "       ";
        break;
      }
    case lexer_mode::eval:
      {
        s1 = ":<>=!&|?, $(){}[]#\t\n";
        s2 = "   = &|             ";
        break;
      }
    case lexer_mode::buildspec:
      {
        // Like the value mode with these differences:
        //
        // 1. Returns '(' as a separated token provided the state stack depth
        //    is less than or equal to 3 (initial state plus two buildspec)
        //    (see parse_buildspec() for details).
        //
        // 2. Recognizes comma.
        //
        // 3. Treat newline as an ordinary space.
        //
        s1 = " $(){}[],\t\n";
        s2 = "           ";
        n = false;
        break;
      }
    case lexer_mode::single_quoted:
    case lexer_mode::double_quoted:
      s = false;
      // Fall through.
    case lexer_mode::variable:
      {
        // These are handled in an ad hoc way in word().
        assert (ps == '\0');
        break;
      }
    default: assert (false); // Unhandled custom mode.
    }

    state_.push (state {m, ps, s, n, q, *esc, s1, s2});
  }

  token lexer::
  next ()
  {
    const state& st (state_.top ());
    lexer_mode m (st.mode);

    // For some modes we have dedicated imlementations of next().
    //
    switch (m)
    {
    case lexer_mode::normal:
    case lexer_mode::value:
    case lexer_mode::values:
    case lexer_mode::switch_expressions:
    case lexer_mode::case_patterns:
    case lexer_mode::attribute:
    case lexer_mode::variable:
    case lexer_mode::buildspec:     break;
    case lexer_mode::eval:          return next_eval ();
    case lexer_mode::double_quoted: return next_quoted ();
    default:                        assert (false); // Unhandled custom mode.
    }

    bool sep (skip_spaces ());

    xchar c (get ());
    uint64_t ln (c.line), cn (c.column);

    auto make_token = [&sep, ln, cn] (type t, string v = string ())
    {
      return token (t, move (v),
                    sep, quote_type::unquoted, false,
                    ln, cn, token_printer);
    };

    if (eos (c))
      return make_token (type::eos);

    // Handle pair separator.
    //
    if (c == st.sep_pair)
      return make_token (type::pair_separator, string (1, c));

    switch (c)
    {
      // NOTE: remember to update mode(), next_eval() if adding new special
      // characters.
      //
    case '\n':
      {
        // Expire value/values modes at the end of the line.
        //
        if (m == lexer_mode::value              ||
            m == lexer_mode::values             ||
            m == lexer_mode::switch_expressions ||
            m == lexer_mode::case_patterns)
          state_.pop ();

        sep = true; // Treat newline as always separated.
        return make_token (type::newline);
      }
    case '{': return make_token (type::lcbrace);
    case '}': return make_token (type::rcbrace);
    case '[': return make_token (type::lsbrace);
    case ']':
      {
        // Expire attribute mode after closing ']'.
        //
        if (m == lexer_mode::attribute)
          state_.pop ();

        return make_token (type::rsbrace);
      }
    case '$': return make_token (type::dollar);
    case ')': return make_token (type::rparen);
    case '(':
      {
        // Left paren is always separated in the buildspec mode.
        //
        if (m == lexer_mode::buildspec && state_.size () <= 3)
          sep = true;

        return make_token (type::lparen);
      }
    }

    // The following characters are special in the normal, variable, and
    // switch_expressions modes.
    //
    if (m == lexer_mode::normal             ||
        m == lexer_mode::variable           ||
        m == lexer_mode::switch_expressions ||
        m == lexer_mode::case_patterns)
    {
      switch (c)
      {
        // NOTE: remember to update mode(), next_eval() if adding new special
        // characters.
        //
      case ':': return make_token (type::colon);
      }
    }

    // The following characters are special in the normal and variable modes.
    //
    if (m == lexer_mode::normal || m == lexer_mode::variable)
    {
      switch (c)
      {
        // NOTE: remember to update mode(), next_eval() if adding new special
        // characters.
        //
      case '=':
        {
          if (peek () == '+')
          {
            get ();
            return make_token (type::prepend);
          }
          else
            return make_token (type::assign);
        }
      case '+':
        {
          if (peek () == '=')
          {
            get ();
            return make_token (type::append);
          }
        }
      }
    }

    // The following characters are special in the normal mode.
    //
    if (m == lexer_mode::normal)
    {
      // NOTE: remember to update mode() if adding new special characters.
      //
      switch (c)
      {
      case '<': return make_token (type::labrace);
      case '>': return make_token (type::rabrace);
      }
    }

    // The following characters are special in the values and buildspec mode.
    //
    if (m == lexer_mode::buildspec          ||
        m == lexer_mode::values             ||
        m == lexer_mode::switch_expressions ||
        m == lexer_mode::case_patterns)
    {
      // NOTE: remember to update mode() if adding new special characters.
      //
      switch (c)
      {
      case ',': return make_token (type::comma);
      }
    }

    // The following characters are special in the case_patterns mode.
    //
    if (m == lexer_mode::case_patterns)
    {
      // NOTE: remember to update mode() if adding new special characters.
      //
      switch (c)
      {
      case '|': return make_token (type::bit_or);
      }
    }

    // Otherwise it is a word.
    //
    unget (c);
    return word (st, sep);
  }

  token lexer::
  next_eval ()
  {
    bool sep (skip_spaces ());
    xchar c (get ());

    if (eos (c))
      fail (c) << "unterminated evaluation context";

    const state& st (state_.top ());

    uint64_t ln (c.line), cn (c.column);

    auto make_token = [sep, ln, cn] (type t, string v = string ())
    {
      return token (t, move (v),
                    sep, quote_type::unquoted, false,
                    ln, cn, token_printer);
    };

    // This mode is quite a bit like the value mode when it comes to special
    // characters, except that we have some of our own.
    //

    // Handle pair separator.
    //
    if (c == st.sep_pair)
      return make_token (type::pair_separator, string (1, c));

    // Note: we don't treat [ and ] as special here. Maybe can use them for
    // something later.
    //
    switch (c)
    {
      // NOTE: remember to update mode() if adding new special characters.
      //
    case '\n': fail (c) << "newline in evaluation context" << endf;
    case ':': return make_token (type::colon);
    case '{': return make_token (type::lcbrace);
    case '}': return make_token (type::rcbrace);
    case '[': return make_token (type::lsbrace);
    case ']': return make_token (type::rsbrace);
    case '$': return make_token (type::dollar);
    case '?': return make_token (type::question);
    case ',': return make_token (type::comma);
    case '(': return make_token (type::lparen);
    case ')':
      {
        state_.pop (); // Expire eval mode.
        return make_token (type::rparen);
      }
      // Potentially two-character tokens.
      //
    case '=':
    case '!':
    case '<':
    case '>':
    case '|':
    case '&':
      {
        xchar p (peek ());

        type r (type::eos);
        switch (c)
        {
        case '|': if (p == '|') r = type::log_or; break;
        case '&': if (p == '&') r = type::log_and; break;

        case '<': r = (p == '=' ? type::less_equal : type::less); break;
        case '>': r = (p == '=' ? type::greater_equal : type::greater); break;

        case '=': if (p == '=') r = type::equal; break;

        case '!': r = (p == '=' ? type::not_equal : type::log_not); break;
        }

        if (r == type::eos)
          break;

        switch (r)
        {
        case type::less:
        case type::greater:
        case type::log_not: break;
        default:            get ();
        }

        return make_token (r);
      }
    }

    // Otherwise it is a word.
    //
    unget (c);
    return word (st, sep);
  }

  token lexer::
  next_quoted ()
  {
    xchar c (get ());

    if (eos (c))
      fail (c) << "unterminated double-quoted sequence";

    uint64_t ln (c.line), cn (c.column);

    auto make_token = [ln, cn] (type t)
    {
      return token (t, false, quote_type::double_, ln, cn, token_printer);
    };

    switch (c)
    {
    case '$': return make_token (type::dollar);
    case '(': return make_token (type::lparen);
    }

    // Otherwise it is a word.
    //
    unget (c);
    return word (state_.top (), false);
  }

  token lexer::
  word (state st, bool sep)
  {
    lexer_mode m (st.mode);

    xchar c (peek ());
    assert (!eos (c));

    uint64_t ln (c.line), cn (c.column);

    string lexeme;
    quote_type qtype (m == lexer_mode::double_quoted
                      ? quote_type::double_
                      : quote_type::unquoted);

    // If we are already in the quoted mode then we didn't start with the
    // quote character.
    //
    bool qcomp (false);

    auto append = [&lexeme, &m, &qcomp] (char c)
    {
      lexeme += c;

      // An unquoted character after a quoted fragment.
      //
      if (qcomp && m != lexer_mode::double_quoted)
        qcomp = false;
    };

    for (; !eos (c); c = peek ())
    {
      // First handle escape sequences.
      //
      if (c == '\\')
      {
        // In the variable mode we treat the beginning of the escape sequence
        // as a separator (think \"$foo\").
        //
        if (m == lexer_mode::variable)
          break;

        get ();
        xchar p (peek ());

        const char* esc (st.escapes);

        if (esc == nullptr ||
            (*esc != '\0' && !eos (p) && strchr (esc, p) != nullptr))
        {
          get ();

          if (eos (p))
            fail (p) << "unterminated escape sequence";

          if (p != '\n') // Ignore if line continuation.
            append (p);

          continue;
        }
        else
          unget (c); // Treat as a normal character.
      }

      bool done (false);

      // Next take care of the double-quoted mode. This one is tricky since
      // we push/pop modes while accumulating the same lexeme for example:
      //
      // foo" bar "baz
      //
      if (m == lexer_mode::double_quoted)
      {
        switch (c)
        {
          // Only these two characters are special in the double-quoted mode.
          //
        case '$':
        case '(':
          {
            done = true;
            break;
          }
          // End quote.
          //
        case '\"':
          {
            get ();
            state_.pop ();

            st = state_.top ();
            m = st.mode;
            continue;
          }
        }
      }
      // We also handle the variable mode in an ad hoc way.
      //
      else if (m == lexer_mode::variable)
      {
        if (c != '_' && !(lexeme.empty () ? alpha (c) : alnum (c)))
        {
          if (c != '.')
            done = true;
          else
          {
            // Normally '.' is part of the variable (namespace separator)
            // unless it is trailing (think $major.$minor).
            //
            get ();
            xchar p (peek ());
            done = eos (p) || !(alpha (p) ||  p == '_');
            unget (c);
          }
        }
      }
      else
      {
        // First check if it's a pair separator.
        //
        if (c == st.sep_pair)
          done = true;
        else
        {
          // Then see if this character or character sequence is a separator.
          //
          for (const char* p (strchr (st.sep_first, c));
               p != nullptr;
               p = done ? nullptr : strchr (p + 1, c))
          {
            char s (st.sep_second[p - st.sep_first]);

            // See if it has a second.
            //
            if (s != ' ')
            {
              get ();
              done = (peek () == s);
              unget (c);
            }
            else
              done = true;
          }
        }

        // Handle single and double quotes if enabled for this mode and unless
        // they were considered separators.
        //
        if (st.quotes && !done)
        {
          switch (c)
          {
          case '\'':
            {
              // Enter the single-quoted mode in case the derived lexer needs
              // to notice this.
              //
              mode (lexer_mode::single_quoted);

              switch (qtype)
              {
              case quote_type::unquoted:
                qtype = quote_type::single;
                qcomp = lexeme.empty ();
                break;
              case quote_type::single:
                qcomp = false; // Non-contiguous.
                break;
              case quote_type::double_:
                qtype = quote_type::mixed;
                // Fall through.
              case quote_type::mixed:
                qcomp = false;
                break;
              }

              get ();
              for (c = get (); !eos (c) && c != '\''; c = get ())
                lexeme += c;

              if (eos (c))
                fail (c) << "unterminated single-quoted sequence";

              state_.pop ();
              continue;
            }
          case '\"':
            {
              get ();

              mode (lexer_mode::double_quoted);
              st = state_.top ();
              m = st.mode;

              switch (qtype)
              {
              case quote_type::unquoted:
                qtype = quote_type::double_;
                qcomp = lexeme.empty ();
                break;
              case quote_type::double_:
                qcomp = false; // Non-contiguous.
                break;
              case quote_type::single:
                qtype = quote_type::mixed;
                // Fall through.
              case quote_type::mixed:
                qcomp = false;
                break;
              }

              continue;
            }
          }
        }
      }

      if (done)
        break;

      get ();
      append (c);
    }

    if (m == lexer_mode::double_quoted)
    {
      if (eos (c))
        fail (c) << "unterminated double-quoted sequence";

      // If we are still in the quoted mode then we didn't end with the quote
      // character.
      //
      if (qcomp)
        qcomp = false;
    }

    // Expire variable mode at the end of the word.
    //
    if (m == lexer_mode::variable)
      state_.pop ();

    return token (move (lexeme), sep, qtype, qcomp, ln, cn);
  }

  bool lexer::
  skip_spaces ()
  {
    bool r (sep_);
    sep_ = false;

    const state& s (state_.top ());

    // In some special modes we don't skip spaces.
    //
    if (!s.sep_space)
      return r;

    xchar c (peek ());
    bool start (c.column == 1);

    for (; !eos (c); c = peek ())
    {
      switch (c)
      {
      case ' ':
      case '\t':
        {
          r = true;
          break;
        }
      case '\n':
        {
          // In some modes we treat newlines as ordinary spaces.
          //
          if (!s.sep_newline)
          {
            r = true;
            break;
          }

          // Skip empty lines.
          //
          if (start)
          {
            r = false;
            break;
          }

          return r;
        }
      case '#':
        {
          r = true;
          get ();

          // See if this is a multi-line comment in the form:
          //
          /*
             #\
             ...
             #\
          */
          auto ml = [&c, this] () -> bool
          {
            if ((c = peek ()) == '\\')
            {
              get ();
              if ((c = peek ()) == '\n')
                return true;
            }

            return false;
          };

          if (ml ())
          {
            // Scan until we see the closing one.
            //
            for (; !eos (c); c = peek ())
            {
              get ();
              if (c == '#' && ml ())
                break;
            }

            if (eos (c))
              fail (c) << "unterminated multi-line comment";
          }
          else
          {
            // Read until newline or eos.
            //
            for (; !eos (c) && c != '\n'; c = peek ())
              get ();
          }

          continue;
        }
      case '\\':
        {
          get ();

          if (peek () == '\n')
            break; // Ignore.

          unget (c);
        }
        // Fall through.
      default:
        return r; // Not a space.
      }

      get ();
    }

    return r;
  }
}