// file      : libbuild2/test/script/lexer.cxx -*- C++ -*-
// copyright : Copyright (c) 2014-2019 Code Synthesis Ltd
// license   : MIT; see accompanying LICENSE file

#include <libbuild2/test/script/lexer.hxx>

#include <cstring> // strchr()

using namespace std;

namespace build2
{
  namespace test
  {
    namespace script
    {
      using type = token_type;

      void lexer::
      mode (base_mode m, char ps, optional<const char*> esc)
      {
        const char* s1 (nullptr);
        const char* s2 (nullptr);
        bool s (true);
        bool n (true);
        bool q (true);

        if (!esc)
        {
          assert (!state_.empty ());
          esc = state_.top ().escapes;
        }

        switch (m)
        {
        case lexer_mode::command_line:
          {
            s1 = ":;=!|&<> $(#\t\n";
            s2 = "  ==          ";
            break;
          }
        case lexer_mode::first_token:
          {
            // First token on the script line. Like command_line but
            // recognizes leading '.+-{}' as tokens as well as variable
            // assignments as separators.
            //
            // Note that to recognize only leading '.+-{}' we shouldn't add
            // them to the separator strings.
            //
            s1 = ":;=+!|&<> $(#\t\n";
            s2 = "   ==          ";
            break;
          }
        case lexer_mode::second_token:
          {
            // Second token on the script line. Like command_line but
            // recognizes leading variable assignments.
            //
            // Note that to recognize only leading assignments we shouldn't
            // add them to the separator strings (so this is identical to
            // command_line).
            //
            s1 = ":;=!|&<> $(#\t\n";
            s2 = "  ==          ";
            break;
          }
        case lexer_mode::variable_line:
          {
            // Like value except we recognize ';' and don't recognize '{'.
            // Note that we don't recognize ':' since having a trailing
            // variable assignment is illegal.
            //
            s1 = "; $([]#\t\n";
            s2 = "         ";
            break;
          }

        case lexer_mode::command_expansion:
          {
            // Note that whitespaces are not word separators in this mode.
            //
            s1 = "|&<>";
            s2 = "    ";
            s = false;
            break;
          }
        case lexer_mode::here_line_single:
          {
            // This one is like a single-quoted string except it treats
            // newlines as a separator. We also treat quotes as literals.
            //
            // Note that it might be tempting to enable line continuation
            // escapes. However, we will then have to also enable escaping of
            // the backslash, which makes it a lot less tempting.
            //
            s1 = "\n";
            s2 = " ";
            esc = ""; // Disable escape sequences.
            s = false;
            q = false;
            break;
          }
        case lexer_mode::here_line_double:
          {
            // This one is like a double-quoted string except it treats
            // newlines as a separator. We also treat quotes as literals.
            //
            s1 = "$(\n";
            s2 = "   ";
            s = false;
            q = false;
            break;
          }
        case lexer_mode::description_line:
          {
            // This one is like a single-quoted string and has an ad hoc
            // implementation.
            //
            break;
          }
        default:
          {
            // Make sure pair separators are only enabled where we expect
            // them.
            //
            // @@ Should we disable pair separators in the eval mode?
            //
            assert (ps == '\0' ||
                    m == lexer_mode::eval ||
                    m == lexer_mode::attribute);

            base_lexer::mode (m, ps, esc);
            return;
          }
        }

        assert (ps == '\0');
        state_.push (state {m, ps, s, n, q, *esc, s1, s2});
      }

      token lexer::
      next ()
      {
        token r;

        switch (state_.top ().mode)
        {
        case lexer_mode::command_line:
        case lexer_mode::first_token:
        case lexer_mode::second_token:
        case lexer_mode::variable_line:
        case lexer_mode::command_expansion:
        case lexer_mode::here_line_single:
        case lexer_mode::here_line_double:
          r = next_line ();
          break;
        case lexer_mode::description_line:
          r = next_description ();
          break;
        default:
          r = base_lexer::next ();
          break;
        }

        if (r.qtype != quote_type::unquoted)
          ++quoted_;

        return r;
      }

      token lexer::
      next_line ()
      {
        bool sep (skip_spaces ());

        xchar c (get ());
        uint64_t ln (c.line), cn (c.column);

        if (eos (c))
          return token (type::eos, sep, ln, cn, token_printer);

        state st (state_.top ()); // Make copy (see first/second_token).
        lexer_mode m (st.mode);

        auto make_token = [&sep, &m, ln, cn] (type t, string v = string ())
        {
          bool q (m == lexer_mode::here_line_double);

          return token (t, move (v), sep,
                        (q ? quote_type::double_ : quote_type::unquoted), q,
                        ln, cn,
                        token_printer);
        };

        auto make_token_with_modifiers =
          [&make_token, this] (type t,
                               const char* mods,           // To recorgnize.
                               const char* stop = nullptr) // To stop after.
        {
          string v;
          if (mods != nullptr)
          {
            for (xchar p (peek ());
                 (strchr (mods, p) != nullptr &&      // Modifier.
                  strchr (v.c_str (), p) == nullptr); // Not already seen.
                 p = peek ())
            {
              get ();
              v += p;

              if (stop != nullptr && strchr (stop, p) != nullptr)
                break;
            }
          }

          return make_token (t, move (v));
        };

        // Expire certain modes at the end of the token. Do it early in case
        // we push any new mode (e.g., double quote).
        //
        if (m == lexer_mode::first_token || m == lexer_mode::second_token)
          state_.pop ();

        // NOTE: remember to update mode() if adding new special characters.

        if (m != lexer_mode::command_expansion)
        {
          switch (c)
          {
          case '\n':
            {
              // Expire variable value mode at the end of the line.
              //
              if (m == lexer_mode::variable_line)
                state_.pop ();

              sep = true; // Treat newline as always separated.
              return make_token (type::newline);
            }
          }
        }

        if (m != lexer_mode::here_line_single)
        {
          switch (c)
          {
            // Variable expansion, function call, and evaluation context.
            //
          case '$': return make_token (type::dollar);
          case '(': return make_token (type::lparen);
          }
        }


        if (m == lexer_mode::variable_line)
        {
          switch (c)
          {
            // Attributes.
            //
          case '[': return make_token (type::lsbrace);
          case ']': return make_token (type::rsbrace);
          }
        }

        // Line separators.
        //
        if (m == lexer_mode::command_line ||
            m == lexer_mode::first_token  ||
            m == lexer_mode::second_token ||
            m == lexer_mode::variable_line)
        {
          switch (c)
          {
          case ';': return make_token (type::semi);
          }
        }

        if (m == lexer_mode::command_line ||
            m == lexer_mode::first_token  ||
            m == lexer_mode::second_token)
        {
          switch (c)
          {
          case ':': return make_token (type::colon);
          }
        }

        // Command line operator/separators.
        //
        if (m == lexer_mode::command_line ||
            m == lexer_mode::first_token  ||
            m == lexer_mode::second_token)
        {
          switch (c)
          {
            // Comparison (==, !=).
            //
          case '=':
          case '!':
            {
              if (peek () == '=')
              {
                get ();
                return make_token (c == '=' ? type::equal : type::not_equal);
              }
            }
          }
        }

        // Command operators/separators.
        //
        if (m == lexer_mode::command_line ||
            m == lexer_mode::first_token  ||
            m == lexer_mode::second_token ||
            m == lexer_mode::command_expansion)
        {
          switch (c)
          {
            // |, ||
            //
          case '|':
            {
              if (peek () == '|')
              {
                get ();
                return make_token (type::log_or);
              }
              else
                return make_token (type::pipe);
            }
            // &, &&
            //
          case '&':
            {
              xchar p (peek ());

              if (p == '&')
              {
                get ();
                return make_token (type::log_and);
              }

              // These modifiers are mutually exclusive so stop after seeing
              // either one.
              //
              return make_token_with_modifiers (type::clean, "!?", "!?");
            }
            // <
            //
          case '<':
            {
              type r (type::in_str);
              xchar p (peek ());

              if (p == '|' || p == '-' || p == '<')
              {
                get ();

                switch (p)
                {
                case '|': return make_token (type::in_pass);
                case '-': return make_token (type::in_null);
                case '<':
                  {
                    r = type::in_doc;
                    p = peek ();

                    if (p == '<')
                    {
                      get ();
                      r = type::in_file;
                    }
                    break;
                  }
                }
              }

              // Handle modifiers.
              //
              const char* mods (nullptr);
              switch (r)
              {
              case type::in_str:
              case type::in_doc: mods = ":/"; break;
              }

              return make_token_with_modifiers (r, mods);
            }
            // >
            //
          case '>':
            {
              type r (type::out_str);
              xchar p (peek ());

              if (p == '|' || p == '-' || p == '!' || p == '&' ||
                  p == '=' || p == '+' || p == '>')
              {
                get ();

                switch (p)
                {
                case '|': return make_token (type::out_pass);
                case '-': return make_token (type::out_null);
                case '!': return make_token (type::out_trace);
                case '&': return make_token (type::out_merge);
                case '=': return make_token (type::out_file_ovr);
                case '+': return make_token (type::out_file_app);
                case '>':
                  {
                    r = type::out_doc;
                    p = peek ();

                    if (p == '>')
                    {
                      get ();
                      r = type::out_file_cmp;
                    }
                    break;
                  }
                }
              }

              // Handle modifiers.
              //
              const char* mods (nullptr);
              const char* stop (nullptr);
              switch (r)
              {
              case type::out_str:
              case type::out_doc:  mods = ":/~"; stop = "~"; break;
              }

              return make_token_with_modifiers (r, mods, stop);
            }
          }
        }

        // Dot, plus/minus, and left/right curly braces.
        //
        if (m == lexer_mode::first_token)
        {
          switch (c)
          {
          case '.': return make_token (type::dot);
          case '+': return make_token (type::plus);
          case '-': return make_token (type::minus);
          case '{': return make_token (type::lcbrace);
          case '}': return make_token (type::rcbrace);
          }
        }

        // Variable assignment (=, +=, =+).
        //
        if (m == lexer_mode::second_token)
        {
          switch (c)
          {
          case '=':
            {
              if (peek () == '+')
              {
                get ();
                return make_token (type::prepend);
              }
              else
                return make_token (type::assign);
            }
          case '+':
            {
              if (peek () == '=')
              {
                get ();
                return make_token (type::append);
              }
            }
          }
        }

        // Otherwise it is a word.
        //
        unget (c);
        return word (st, sep);
      }

      token lexer::
      next_description ()
      {
        xchar c (peek ());

        if (eos (c))
          fail (c) << "expected newline at the end of description line";

        uint64_t ln (c.line), cn (c.column);

        if (c == '\n')
        {
          get ();
          state_.pop (); // Expire the description mode.
          return token (type::newline, true, ln, cn, token_printer);
        }

        string lexeme;

        // For now no line continutions though we could support them.
        //
        for (; !eos (c) && c != '\n'; c = peek ())
        {
          get ();
          lexeme += c;
        }

        return token (move (lexeme),
                      false,
                      quote_type::unquoted, false,
                      ln, cn);
      }

      token lexer::
      word (state st, bool sep)
      {
        lexer_mode m (st.mode);

        // Customized implementation that handles special variable names ($*,
        // $N, $~, $@).
        //
        if (m != lexer_mode::variable)
          return base_lexer::word (st, sep);

        xchar c (peek ());

        if (c != '*' && c != '~' && c != '@' && !digit (c))
          return base_lexer::word (st, sep);

        get ();

        if (digit (c) && digit (peek ()))
          fail (c) << "multi-digit special variable name";

        state_.pop (); // Expire the variable mode.
        return token (string (1, c),
                      sep,
                      quote_type::unquoted, false,
                      c.line, c.column);
      }
    }
  }
}