// file      : libbuild2/lexer.hxx -*- C++ -*-
// license   : MIT; see accompanying LICENSE file

#ifndef LIBBUILD2_LEXER_HXX
#define LIBBUILD2_LEXER_HXX

#include <stack>

#include <libbutl/utf8.hxx>
#include <libbutl/unicode.hxx>
#include <libbutl/char-scanner.hxx>

#include <libbuild2/types.hxx>
#include <libbuild2/utility.hxx>

#include <libbuild2/token.hxx>
#include <libbuild2/diagnostics.hxx>

#include <libbuild2/export.hxx>

namespace build2
{
  // Context-dependent lexing mode.
  //
  // Quoted modes are internal and should not be set explicitly. In the value
  // mode we don't treat certain characters (e.g., `+`, `=`) as special so
  // that we can use them in the variable values, e.g., `foo = g++`. In
  // contrast, in the variable mode, we restrict certain character (e.g., `/`)
  // from appearing in the name. The values mode is like value but recogizes
  // `,` as special (used in contexts where we need to list multiple
  // values). The attributes/attribute_value modes are like values where each
  // value is potentially a variable assignment; they don't treat `{` and `}`
  // as special (so we cannot have name groups in attributes) as well as
  // recognizes `=` and `]`. The subscript mode is like value but doesn't
  // treat `{` and `}` as special and recognizes `]`. The eval mode is used in
  // the evaluation context.
  //
  // A number of modes are "derived" from the value/values mode by recognizing
  // a few extra characters:
  //
  //   switch_expressions  values plus `:`
  //   case_patterns       values plus `|` and `:`
  //
  // Note that the normal, value/values and derived, as well as eval modes
  // split words separated by the pair character (to disable pairs one can
  // pass `\0` as a pair character).
  //
  // The normal mode recognizes `%` and `{{...` at the beginning of the line
  // as special. The cmdvar mode is like normal but does not treat these
  // character sequences as special.
  //
  // Finally, the foreign mode reads everything until encountering a line that
  // contains nothing (besides whitespaces) other than the closing multi-
  // curly-brace (`}}...`) (or eos) returning the contents as the word token
  // followed by the multi_rcbrace (or eos). In a way it is similar to the
  // single-quote mode. The number of closing braces to expect is passed as
  // mode data.
  //
  // The mode data is also used by a few other modes. The buildspec mode uses
  // it as a boolean value to decide whether to recognize newlines as tokens.
  // In the variable mode the mode data may be a pointer to a C string that
  // contains a list of special one-character variable names to recognize
  // (e.g., $<, $~, etc). Note that the parser has a special kludge to also
  // recognize them as $(<), etc.
  //
  // The alternative modes must be set manually. The value/values and derived
  // modes automatically expires after the end of the line. The attribute and
  // subscript modes expires after the closing `]`. The variable mode expires
  // after the word token. The eval mode expires after the closing `)`. And
  // the foreign mode expires after the closing braces.
  //
  // Note that normally it is only safe to switch mode when the current token
  // is not quoted (or, more generally, when you are not in the double-quoted
  // mode) unless the mode treats the double-quote as a separator (e.g.,
  // variable name mode). Failed that your mode (which now will be the top of
  // the mode stack) will prevent proper recognition of the closing quote.
  //
  // The `[` token is used for attributes (where it cuts across most of the
  // modes) as well as for value subscript (where it is only recognized after
  // expansions). It is handled with a flag. In the normal mode it is
  // automatically set at the beginning and after each newline. In all other
  // modes it must be explicitly set at points where attribute/subscript is
  // recognized. In all the cases it is automatically reset after lexing the
  // next token (whether `[` or not).

  // Extendable/inheritable enum-like class.
  //
  struct lexer_mode: lexer_mode_base
  {
    using base_type = lexer_mode_base;

    enum
    {
      normal = base_type::value_next,
      cmdvar,
      variable,
      value,
      values,
      case_patterns,
      switch_expressions,
      attributes,
      attribute_value,
      subscript,
      eval,
      single_quoted,
      double_quoted,
      foreign,
      buildspec,

      value_next
    };

    lexer_mode () = default;
    lexer_mode (value_type v): base_type (v) {}
    lexer_mode (base_type v): base_type (v) {}
  };

  class LIBBUILD2_SYMEXPORT lexer:
    public butl::char_scanner<butl::utf8_validator, 2>
  {
  public:
    // If escape is not NULL then only escape sequences with characters from
    // this string are considered "effective escapes" with all others passed
    // through as is. Note that neither the name nor escape arguments are
    // copied.
    //
    lexer (istream& is,
           const path_name& name,
           uint64_t line = 1, // Start line in the stream.
           const char* escapes = nullptr)
      : lexer (is, name, line, escapes, true /* set_mode */) {}

    const path_name&
    name () const {return name_;}

    // Note: sets mode for the next token. The second argument can be used to
    // specify the pair separator character (if the mode supports pairs). If
    // escapes is not specified, then inherit the current mode's (though a
    // mode can also override it).
    //
    virtual void
    mode (lexer_mode,
          char pair_separator = '\0',
          optional<const char*> escapes = nullopt,
          uintptr_t data = 0);

    // Enable `[` recognition for the next token.
    //
    void
    enable_lsbrace (bool unsep = false)
    {
      state_.top ().lsbrace = true;
      state_.top ().lsbrace_unsep = unsep;
    }

    // Expire the current mode early.
    //
    void
    expire_mode () {state_.pop ();}

    lexer_mode
    mode () const {return state_.top ().mode;}

    uintptr_t
    mode_data () const {return state_.top ().data;}

    char
    pair_separator () const {return state_.top ().sep_pair;}

    // Scanner. Note that it is ok to call next() again after getting eos.
    //
    // If you extend the lexer and add a custom lexer mode, then you must
    // override next() and handle the custom mode there.
    //
    virtual token
    next ();

    // Peek at the first two characters of the next token(s). Return the
    // characters or '\0' if either would be eos. Also return an indicator of
    // whether the next token would be separated. Note: cannot be used to peek
    // at the first character of a line.
    //
    // Note also that it assumes that the current mode and the potential new
    // mode in which these characters will actually be parsed use the same
    // whitespace separation (the sep_space and sep_newline values).
    //
    pair<pair<char, char>, bool>
    peek_chars ();

    // As base::get() but in case of an invalid character issue diagnostics
    // and throw failed.
    //
    xchar
    get ();

    // Get previously peeked character (faster).
    //
    void
    get (const xchar&);

    // As base::peek() but in case of an invalid character issue diagnostics
    // and throw failed.
    //
    xchar
    peek ();

  protected:
    struct state
    {
      lexer_mode      mode;
      uintptr_t       data;
      optional<token> hold;

      bool lsbrace;       // Recognize `[`.
      bool lsbrace_unsep; // Recognize it only if unseparated.

      char sep_pair;
      bool sep_space;    // Are whitespaces separators (see skip_spaces())?
      bool sep_newline;  // Is newline special (see skip_spaces())?
      bool quotes;       // Recognize quoted fragments.

      const char* escapes; // Effective escape sequences to recognize.

      // Word separator characters. For two-character sequence put the first
      // one in sep_first and the second one in the corresponding position of
      // sep_second. If it's a single-character sequence, then put space in
      // sep_second. If there are multiple sequences that start with the same
      // character, then repeat the first character in sep_first.
      //
      const char* sep_first;
      const char* sep_second;
    };

    token
    next_eval ();

    token
    next_quoted ();

    token
    next_foreign ();

    // Lex a word assuming current is the top state (which may already have
    // been "expired" from the top).
    //
    virtual token
    word (state current, bool separated);

    // Return true in first if we have seen any spaces. Skipped empty lines
    // don't count. In other words, we are only interested in spaces that are
    // on the same line as the following non-space character. Return true in
    // second if we have started skipping spaces from column 1 (note that
    // if this mode does not skip spaces, then second will always be false).
    //
    pair<bool, bool>
    skip_spaces ();

    // Diagnostics.
    //
  protected:
    fail_mark fail;

    [[noreturn]] void
    fail_char (const xchar&);

    // Lexer state.
    //
  protected:
    lexer (istream& is, const path_name& name, uint64_t line,
           const char* escapes,
           bool set_mode)
      : char_scanner (is,
                      butl::utf8_validator (butl::codepoint_types::graphic,
                                            U"\n\r\t"),
                      true /* crlf */,
                      line),
        fail ("error", &name),
        name_ (name),
        sep_ (false)
    {
      if (set_mode)
        mode (lexer_mode::normal, '@', escapes);
    }

    const path_name& name_;
    std::stack<state> state_;

    bool sep_; // True if we skipped spaces in peek().

  private:
    using base = char_scanner<butl::utf8_validator, 2>;

    // Buffer for a get()/peek() potential error.
    //
    string ebuf_;
  };
}

// Diagnostics plumbing.
//
namespace butl // ADL
{
  inline build2::location
  get_location (const butl::char_scanner<butl::utf8_validator, 2>::xchar& c,
                const void* data)
  {
    using namespace build2;

    assert (data != nullptr); // E.g., must be &lexer::name_.
    return location (*static_cast<const path_name*> (data), c.line, c.column);
  }
}

#include <libbuild2/lexer.ixx>

#endif // LIBBUILD2_LEXER_HXX