1 files changed, 673 insertions, 0 deletions
diff --git a/libbuild2/parser.hxx b/libbuild2/parser.hxx
new file mode 100644
index 0000000..658f266
--- /dev/null
+++ b/libbuild2/parser.hxx
@@ -0,0 +1,673 @@
+// file      : libbuild2/parser.hxx -*- C++ -*-
+// copyright : Copyright (c) 2014-2019 Code Synthesis Ltd
+// license   : MIT; see accompanying LICENSE file
+
+#ifndef LIBBUILD2_PARSER_HXX
+#define LIBBUILD2_PARSER_HXX
+
+#include <stack>
+
+#include <libbuild2/types.hxx>
+#include <libbuild2/utility.hxx>
+
+#include <libbuild2/spec.hxx>
+#include <libbuild2/lexer.hxx>
+#include <libbuild2/token.hxx>
+#include <libbuild2/variable.hxx>
+#include <libbuild2/diagnostics.hxx>
+
+#include <libbuild2/export.hxx>
+
+namespace build2
+{
+  class scope;
+  class target;
+  class prerequisite;
+
+  class LIBBUILD2_SYMEXPORT parser
+  {
+  public:
+    // If boot is true, then we are parsing bootstrap.build and modules
+    // should only be bootstrapped.
+    //
+    explicit
+    parser (bool boot = false): fail ("error", &path_), boot_ (boot) {}
+
+    // Issue diagnostics and throw failed in case of an error.
+    //
+    void
+    parse_buildfile (istream&, const path& name, scope& root, scope& base);
+
+    buildspec
+    parse_buildspec (istream&, const path& name);
+
+    token
+    parse_variable (lexer&, scope&, const variable&, token_type kind);
+
+    pair<value, token>
+    parse_variable_value (lexer&, scope&, const dir_path*, const variable&);
+
+    names
+    parse_export_stub (istream& is, const path& p, scope& r, scope& b)
+    {
+      parse_buildfile (is, p, r, b);
+      return move (export_value_);
+    }
+
+    // Recursive descent parser.
+    //
+  protected:
+
+    // Pattern expansion mode.
+    //
+    enum class pattern_mode
+    {
+      ignore, // Treat as ordinary names.
+      detect, // Ignore pair/dir/type if the first name is a pattern.
+      expand  // Expand to ordinary names.
+    };
+
+    // If one is true then parse a single (logical) line (logical means it
+    // can actually be several lines, e.g., an if-block). Return false if
+    // nothing has been parsed (i.e., we are still on the same token).
+    //
+    // Note that after this function returns, the token is the first token of
+    // the next line (or eos).
+    //
+    bool
+    parse_clause (token&, token_type&, bool one = false);
+
+    void
+    parse_variable_block (token&, token_type&, const target_type*, string);
+
+    // Ad hoc target names inside < ... >.
+    //
+    struct adhoc_names_loc
+    {
+      names ns;
+      location loc;
+    };
+
+    using adhoc_names = small_vector<adhoc_names_loc, 1>;
+
+    void
+    enter_adhoc_members (adhoc_names_loc&&, bool);
+
+    small_vector<reference_wrapper<target>, 1>
+    enter_targets (names&&, const location&, adhoc_names&&, size_t);
+
+    bool
+    parse_dependency (token&, token_type&,
+                      names&&, const location&,
+                      adhoc_names&&,
+                      names&&, const location&,
+                      bool = false);
+
+    void
+    parse_assert (token&, token_type&);
+
+    void
+    parse_print (token&, token_type&);
+
+    void
+    parse_diag (token&, token_type&);
+
+    void
+    parse_dump (token&, token_type&);
+
+    void
+    parse_source (token&, token_type&);
+
+    void
+    parse_include (token&, token_type&);
+
+    void
+    parse_run (token&, token_type&);
+
+    void
+    parse_import (token&, token_type&);
+
+    void
+    parse_export (token&, token_type&);
+
+    void
+    parse_using (token&, token_type&);
+
+    void
+    parse_define (token&, token_type&);
+
+    void
+    parse_if_else (token&, token_type&);
+
+    void
+    parse_for (token&, token_type&);
+
+    void
+    parse_variable (token&, token_type&, const variable&, token_type);
+
+    void
+    parse_type_pattern_variable (token&, token_type&,
+                                 const target_type&, string,
+                                 const variable&, token_type, const location&);
+
+    const variable&
+    parse_variable_name (names&&, const location&);
+
+    // Note: calls attributes_push() that the caller must pop.
+    //
+    value
+    parse_variable_value (token&, token_type&);
+
+    void
+    apply_variable_attributes (const variable&);
+
+    void
+    apply_value_attributes (const variable*, // Optional.
+                            value& lhs,
+                            value&& rhs,
+                            token_type assign_kind);
+
+    // Return the value pack (values can be NULL/typed). Note that for an
+    // empty eval context ('()' potentially with whitespaces in between) the
+    // result is an empty pack, not a pack of one empty.
+    //
+    values
+    parse_eval (token&, token_type&, pattern_mode);
+
+    values
+    parse_eval_comma (token&, token_type&, pattern_mode, bool = false);
+
+    value
+    parse_eval_ternary (token&, token_type&, pattern_mode, bool = false);
+
+    value
+    parse_eval_or (token&, token_type&, pattern_mode, bool = false);
+
+    value
+    parse_eval_and (token&, token_type&, pattern_mode, bool = false);
+
+    value
+    parse_eval_comp (token&, token_type&, pattern_mode, bool = false);
+
+    value
+    parse_eval_value (token&, token_type&, pattern_mode, bool = false);
+
+    // Attributes stack. We can have nested attributes, for example:
+    //
+    // x = [bool] ([uint64] $x == [uint64] $y)
+    //
+    // In this example we only apply the value attributes after evaluating
+    // the context, which has its own attributes.
+    //
+    struct attributes
+    {
+      bool has;                         // Has attributes flag.
+      location loc;                     // Start of attributes location.
+      vector<pair<string, string>> ats; // Attributes.
+
+      explicit operator bool () const {return has;}
+    };
+
+    // Push a new entry into the attributes_ stack. If the next token is '['
+    // parse the attribute sequence until ']' storing the result in the new
+    // stack entry and setting the 'has' flag (unless the attribute list is
+    // empty). Then get the next token and, if standalone is false, verify
+    // it is not newline/eos (i.e., there is something after it). Return the
+    // indication of whether there are any attributes and their location.
+    //
+    // Note that during pre-parsing nothing is pushed into the stack and
+    // the returned indication is that there are no attributes.
+    //
+    pair<bool, location>
+    attributes_push (token&, token_type&, bool standalone = false);
+
+    attributes
+    attributes_pop () // Pop and return the top attributes_ entry.
+    {
+      assert (!pre_parse_ && !attributes_.empty ()); // top() on empty is UB.
+      attributes r (move (attributes_.top ()));
+      attributes_.pop ();
+      return r;
+    }
+
+    attributes&
+    attributes_top () {return attributes_.top ();}
+
+    // Source a stream optionally entering it as a buildfile and performing
+    // the default target processing.
+    //
+    void
+    source (istream&,
+            const path&,
+            const location&,
+            bool enter,
+            bool default_target);
+
+    // If chunk is true, then parse the smallest but complete, name-wise,
+    // chunk of input. Note that in this case you may still end up with
+    // multiple names, for example, {foo bar} or $foo. In the pre-parse mode
+    // always return empty list of names.
+    //
+    // The what argument is used in diagnostics (e.g., "expected <what>
+    // instead of ...").
+    //
+    // The separators argument specifies the special characters to recognize
+    // inside the name. These can be the directory separators and the '%'
+    // project separator. Note that even if it is NULL, the result may still
+    // contain non-simple names due to variable expansions.
+    //
+
+    static const string name_separators;
+
+    names
+    parse_names (token& t, token_type& tt,
+                 pattern_mode pmode,
+                 bool chunk = false,
+                 const char* what = "name",
+                 const string* separators = &name_separators)
+    {
+      // Delegate to the appending overload (the one taking names&) and
+      // discard its result, returning just the names. The trailing arguments
+      // are spelled out and match that overload's defaults.
+      //
+      names r;
+      parse_names (t, tt,
+                   r, pmode, chunk, what, separators,
+                   0, nullopt, nullptr, nullptr); // pairn, prj, dir, type.
+
+      return r;
+    }
+
+    // Return true if this token starts a name. Or, to put it another way,
+    // calling parse_names() on this token won't fail with the "expected name
+    // instead of <this-token>" error. Only consider '(' if the second
+    // argument is true.
+    //
+    bool
+    start_names (token_type&, bool lparen = true);
+
+    // As above but return the result as a value, which can be typed and NULL.
+    //
+    value
+    parse_value (token& t, token_type& tt,
+                 pattern_mode pmode,
+                 const char* what = "name",
+                 const string* separators = &name_separators,
+                 bool chunk = false)
+    {
+      // Parse into untyped names first, noting whether the result is NULL
+      // and/or typed.
+      //
+      names ns;
+      parse_names_result pr (
+        parse_names (t, tt,
+                     ns, pmode, chunk, what, separators,
+                     0, nullopt, nullptr, nullptr)); // pairn, prj, dir, type.
+
+      // Start with a (potentially typed) NULL value and only assign the names
+      // if the result is not NULL. The assignment should not fail since we
+      // are typing the result of reversal from the typed value.
+      //
+      value r (pr.type);
+
+      if (pr.not_null)
+        r.assign (move (ns), nullptr);
+
+      return r;
+    }
+
+    // Append names and return the indication if the parsed value is not NULL
+    // and whether it is typed (and whether it is a pattern if pattern_mode is
+    // detect).
+    //
+    // You may have noticed that what we return here is essentially a value
+    // and doing it this way (i.e., reversing it to untyped names and
+    // returning its type so that it can potentially be "typed back") is kind
+    // of backwards. The reason we are doing it this way is because in many
+    // places we expect things untyped and if we were to always return a
+    // (potentially typed) value, then we would have to reverse it in all
+    // those places. Still it may make sense to look into redesigning the
+    // whole thing one day.
+    //
+    // Currently the only way for the result to be NULL or have a type is if
+    // it is the result of a sole, unquoted variable expansion, function call,
+    // or context evaluation.
+    //
+    struct parse_names_result
+    {
+      bool not_null;
+      const value_type* type;
+      optional<const target_type*> pattern;
+    };
+
+    parse_names_result
+    parse_names (token&, token_type&,
+                 names&,
+                 pattern_mode,
+                 bool chunk = false,
+                 const char* what = "name",
+                 const string* separators = &name_separators,
+                 size_t pairn = 0,
+                 const optional<project_name>& prj = nullopt,
+                 const dir_path* dir = nullptr,
+                 const string* type = nullptr,
+                 bool cross = true,
+                 bool curly = false);
+
+    size_t
+    parse_names_trailer (token&, token_type&,
+                         names&,
+                         pattern_mode,
+                         const char* what,
+                         const string* separators,
+                         size_t pairn,
+                         const optional<project_name>& prj,
+                         const dir_path* dir,
+                         const string* type,
+                         bool cross);
+
+    size_t
+    expand_name_pattern (const location&,
+                         names&&,
+                         names&,
+                         const char* what,
+                         size_t pairn,
+                         const dir_path* dir,
+                         const string* type,
+                         const target_type*);
+
+    size_t
+    splice_names (const location&,
+                  const names_view&,
+                  names&&,
+                  names&,
+                  const char* what,
+                  size_t pairn,
+                  const optional<project_name>& prj,
+                  const dir_path* dir,
+                  const string* type);
+
+    // Skip until newline or eos.
+    //
+    void
+    skip_line (token&, token_type&);
+
+    // Skip until block-closing } or eos, taking into account nested blocks.
+    //
+    void
+    skip_block (token&, token_type&);
+
+    // Return true if the name token can be considered a directive keyword.
+    //
+    bool
+    keyword (token&);
+
+    // Buildspec.
+    //
+    buildspec
+    parse_buildspec_clause (token&, token_type&, size_t);
+
+    // Customization hooks.
+    //
+  protected:
+    // If qual is not empty, then its pair member should indicate the kind
+    // of qualification: ':' -- target, '/' -- scope.
+    //
+    virtual lookup
+    lookup_variable (name&& qual, string&& name, const location&);
+
+    // Utilities.
+    //
+  protected:
+    class enter_scope;
+    class enter_target;
+    class enter_prerequisite;
+
+    // Switch to a new current scope. Note that this function might also have
+    // to switch to a new root scope if the new current scope is in another
+    // project. So both must be saved and restored.
+    //
+    void
+    switch_scope (const dir_path&);
+
+    void
+    process_default_target (token&);
+
+    // Enter buildfile as a target.
+    //
+    void
+    enter_buildfile (const path&);
+
+    // Lexer.
+    //
+  protected:
+    location
+    get_location (const token& t) const
+    {
+      return build2::get_location (t, *path_);
+    }
+
+    token_type
+    next (token&, token_type&);
+
+    // If the current token is newline, then get the next token. Otherwise,
+    // fail unless the current token is eos (i.e., optional newline at the end
+    // of stream). If the after argument is not \0, use it in diagnostics as
+    // the token after which the newline was expected.
+    //
+    token_type
+    next_after_newline (token&, token_type&, char after = '\0');
+
+    // Be careful with peeking and switching the lexer mode. See keyword()
+    // for more information.
+    //
+    token_type
+    peek ();
+
+    token_type
+    peek (lexer_mode m, char ps = '\0')
+    {
+      // Only switch the mode if nothing has been peeked yet. Otherwise the
+      // peeked token must already be in the requested mode and we don't
+      // re-set it since the mode may have expired after the first token.
+      //
+      if (!peeked_)
+      {
+        mode (m, ps);
+        return peek ();
+      }
+
+      assert (peek_.mode == m);
+      return peek_.token.type;
+    }
+
+    const token&
+    peeked () const
+    {
+      assert (peeked_);
+      return peek_.token;
+    }
+
+    void
+    mode (lexer_mode m, char ps = '\0')
+    {
+      // When replaying the lexer is not touched. Instead, as a sanity check,
+      // verify the mode recorded for the next token matches (the pair
+      // separator is not checked since the lexer's mode() may override it).
+      //
+      if (replay_ == replay::play)
+        assert (replay_i_ != replay_data_.size () &&
+                replay_data_[replay_i_].mode == m);
+      else
+        lexer_->mode (m, ps);
+    }
+
+    lexer_mode
+    mode () const
+    {
+      if (replay_ == replay::play)
+      {
+        assert (replay_i_ != replay_data_.size ());
+        return replay_data_[replay_i_].mode; // Mode of the next replay token.
+      }
+
+      return lexer_->mode (); // Not replaying: ask the lexer.
+    }
+
+    void
+    expire_mode ()
+    {
+      if (replay_ != replay::play)
+        lexer_->expire_mode ();
+    }
+
+    // Token saving and replaying. Note that it can only be used in certain
+    // contexts. Specifically, the code that parses a replay must not interact
+    // with the lexer directly (e.g., the keyword() test). Replays also cannot
+    // nest. For now we don't enforce any of this.
+    //
+    // Note also that the peeked token is not part of the replay, until it
+    // is "got".
+    //
+    void
+    replay_save ()
+    {
+      assert (replay_ == replay::stop);
+      replay_ = replay::save;
+    }
+
+    void
+    replay_play () // Start (or restart) playing the saved tokens.
+    {
+      assert ((replay_ == replay::save && !replay_data_.empty ()) ||
+              (replay_ == replay::play && replay_i_ == replay_data_.size ()));
+
+      if (replay_ == replay::save) // First play (as opposed to a re-play).
+        replay_path_ = path_; // Save old path.
+
+      replay_i_ = 0; // (Re)start from the first saved token.
+      replay_ = replay::play;
+    }
+
+    void
+    replay_stop ()
+    {
+      replay_data_.clear (); // Drop the saved tokens, if any.
+      if (replay_ == replay::play)
+        path_ = replay_path_; // Undo the path changes made by replay_next().
+
+      replay_ = replay::stop;
+    }
+
+    struct replay_guard // Save on construction, stop on destruction.
+    {
+      replay_guard (parser& p, bool start = true): p_ (start ? &p : nullptr)
+      {
+        if (p_ != nullptr)
+          p_->replay_save ();
+      }
+      replay_guard (const replay_guard&) = delete; // A copy would stop twice.
+      replay_guard& operator= (const replay_guard&) = delete;
+
+      void
+      play ()
+      {
+        if (p_ != nullptr)
+          p_->replay_play ();
+      }
+
+      ~replay_guard ()
+      {
+        if (p_ != nullptr)
+          p_->replay_stop ();
+      }
+    private:
+      parser* p_; // NULL if start was false (all operations are then noops).
+    };
+
+    // Stop saving and get the data.
+    //
+    replay_tokens
+    replay_data ()
+    {
+      assert (replay_ == replay::save);
+
+      replay_tokens r (move (replay_data_));
+      replay_data_.clear ();
+      replay_ = replay::stop;
+      return r;
+    }
+
+    // Set the data and start playing.
+    //
+    void
+    replay_data (replay_tokens&& d)
+    {
+      assert (replay_ == replay::stop);
+
+      replay_path_ = path_; // Save old path.
+
+      replay_data_ = move (d);
+      replay_i_ = 0;
+      replay_ = replay::play;
+    }
+
+    // Implementation details, don't call directly.
+    //
+    replay_token
+    lexer_next ()
+    {
+      lexer_mode m (lexer_->mode ()); // Get it first since it may expire.
+      return replay_token {lexer_->next (), path_, m};
+    }
+
+    const replay_token&
+    replay_next ()
+    {
+      assert (replay_i_ != replay_data_.size ());
+      const replay_token& rt (replay_data_[replay_i_++]);
+
+      // Update the path. Note that theoretically it is possible that peeking
+      // at the next token will "change" the path of the current token. The
+      // workaround would be to call get_location() before peeking.
+      //
+      path_ = rt.file;
+
+      return rt;
+    }
+
+    // Diagnostics.
+    //
+  protected:
+    const fail_mark fail;
+
+  protected:
+    bool pre_parse_ = false;
+    bool boot_; // Set by the constructor.
+
+    const path* path_  = nullptr; // Current path.
+    lexer*      lexer_ = nullptr; // Not set by the constructor.
+
+    prerequisite* prerequisite_ = nullptr; // Current prerequisite, if any.
+    target*       target_       = nullptr; // Current target, if any.
+    scope*        scope_        = nullptr; // Current base scope (out_base).
+    scope*        root_         = nullptr; // Current root scope (out_root).
+
+    const dir_path* pbase_ = nullptr; // Current pattern base directory.
+
+    std::stack<attributes> attributes_;
+
+    target* default_target_ = nullptr; // Not set by the constructor.
+    names export_value_;
+
+    replay_token peek_;
+    bool peeked_ = false;
+
+    enum class replay {stop, save, play} replay_ = replay::stop;
+    replay_tokens replay_data_;
+    size_t      replay_i_    = 0;       // Next token position during replay.
+    const path* replay_path_ = nullptr; // Pre-replay path (to be restored).
+  };
+}
+
+#endif // LIBBUILD2_PARSER_HXX