Move cc build system module to separate library

author: Karen Arutyunov <karen@codesynthesis.com> 2019-08-24 17:41:30 +0300
committer: Karen Arutyunov <karen@codesynthesis.com> 2019-08-28 15:01:48 +0300
commit: 4bdf53837e010073de802070d4e6087410662d3e (patch)
tree: 2820d3964877d1a7d498833da325aa3d3a699353 /libbuild2/cc/lexer.hxx
parent: ea24f530048cbce0c5335ca3fd3632c8ce34315a (diff)
1 files changed, 190 insertions, 0 deletions
diff --git a/libbuild2/cc/lexer.hxx b/libbuild2/cc/lexer.hxx
new file mode 100644
index 0000000..cb2b3a5
--- /dev/null
+++ b/libbuild2/cc/lexer.hxx
@@ -0,0 +1,190 @@
+// file      : libbuild2/cc/lexer.hxx -*- C++ -*-
+// copyright : Copyright (c) 2014-2019 Code Synthesis Ltd
+// license   : MIT; see accompanying LICENSE file
+
+#ifndef LIBBUILD2_CC_LEXER_HXX
+#define LIBBUILD2_CC_LEXER_HXX
+
+#include <libbutl/sha256.mxx>
+#include <libbutl/char-scanner.mxx>
+
+#include <libbuild2/types.hxx>
+#include <libbuild2/utility.hxx>
+
+#include <libbuild2/diagnostics.hxx>
+
+namespace build2
+{
+  namespace cc
+  {
+    // Preprocessor-level tokenization of C/C++ source. In other words, the
+    // sequence of tokens returned is similar to what a real C/C++ compiler
+    // would see from its preprocessor.
+    //
+    // The input is a (partially-)preprocessed translation unit that may still
+    // contain comments, line continuations, and preprocessor directives such
+    // as #line, #pragma, but not #include (which is diagnosed). Currently,
+    // all preprocessor directives except #line are ignored and no values are
+    // saved from literals. The #line directive (and its shorthand notation)
+    // is recognized to provide the logical token location.
+    //
+    // While at it we also calculate the checksum of the input ignoring
+    // comments, whitespace, etc. This is used to detect changes that do not
+    // alter the resulting token stream.
+    //
+    enum class token_type
+    {
+      // NOTE: remember to update operator<<() if changing anything here!
+      //
+      eos,         // End of input (also the default token::type).
+
+      dot,         // .
+      semi,        // ;
+      less,        // <
+      greater,     // >
+      lcbrace,     // {
+      rcbrace,     // }
+
+      punctuation, // Other punctuation.
+
+      identifier,  // Identifier.
+
+      number,      // Number literal.
+      character,   // Char   literal.
+      string,      // String literal.
+
+      other        // Other token.
+    };
+
+    struct token
+    {
+      token_type type = token_type::eos;
+      string     value; // Note: values are not saved for literals.
+
+      // Logical position (as provided by the #line directives, if any).
+      //
+      path     file;
+      uint64_t line   = 0;
+      uint64_t column = 0;
+
+      // Physical position in the stream, currently only for identifiers.
+      //
+      uint64_t position = 0;
+    };
+
+    // Output the token value for diagnostics (keep in sync with token_type).
+    //
+    ostream&
+    operator<< (ostream&, const token&);
+
+    class lexer: protected butl::char_scanner // See the top of the file.
+    {
+    public:
+      lexer (ifdstream& is, const path& name)
+          : char_scanner (is, false),
+            name_ (name),            // Own copy of the file name.
+            fail ("error", &name_),  // Diagnostics refer to name_.
+            log_file_ (name) {}      // Logical file starts out as name.
+
+      const path&
+      name () const {return name_;}  // As passed to the constructor.
+
+      string
+      checksum () const {return cs_.string ();}  // cs_ as a string.
+
+      // Return the next token. Ok to call next() again after getting eos.
+      //
+      token
+      next ()
+      {
+        token t;
+        next (t, skip_spaces (), true);
+        return t;
+      }
+
+      // As above but reuse the passed token to avoid a (potential) memory
+      // allocation and return the token's type. Typical usage:
+      //
+      // for (token t; l.next (t) != token_type::eos; )
+      //   ...
+      //
+      token_type
+      next (token& t)
+      {
+        next (t, skip_spaces (), true);
+        return t.type;
+      }
+
+    private:
+      void
+      next (token&, xchar, bool); // Called with the skip_spaces() result.
+
+      void
+      number_literal (token&, xchar);
+
+      void
+      char_literal (token&, xchar);
+
+      void
+      string_literal (token&, xchar);
+
+      void
+      raw_string_literal (token&, xchar);
+
+      void
+      literal_suffix (xchar);
+
+      void
+      line_directive (token&, xchar);
+
+      xchar
+      skip_spaces (bool newline = true);
+
+      // The char_scanner adaptation for newline escape sequence processing.
+      // Enabled by default and is only disabled in the raw string literals.
+      //
+    private:
+      using base = char_scanner;
+
+      xchar
+      peek (bool escape = true);
+
+      xchar
+      get (bool escape = true);
+
+      void
+      get (const xchar& peeked);
+
+      // Hashing versions (presumably also updating cs_; to be confirmed).
+      //
+      xchar
+      geth (bool escape = true);
+
+      void
+      geth (const xchar& peeked);
+
+    private:
+      const path name_;      // Physical file name.
+      const fail_mark fail;  // "error" diagnostics mark (see constructor).
+
+      // Logical file and line as set by the #line directives. Note that the
+      // lexer diagnostics still use the physical file/lines.
+      //
+      path               log_file_;
+      optional<uint64_t> log_line_;
+
+      string tmp_file_; // NOTE(review): purpose not visible in this header.
+      sha256 cs_;       // Checksum state (see checksum()).
+    };
+
+    // Diagnostics plumbing: map a token to its logical location.
+    //
+    inline location
+    get_location (const token& tok, const void* = nullptr)
+    {
+      return location (&tok.file, tok.line, tok.column);
+    }
+  }
+}
+
+#endif // LIBBUILD2_CC_LEXER_HXX
author	Karen Arutyunov <karen@codesynthesis.com>	2019-08-24 17:41:30 +0300
committer	Karen Arutyunov <karen@codesynthesis.com>	2019-08-28 15:01:48 +0300
commit	4bdf53837e010073de802070d4e6087410662d3e (patch)
tree	2820d3964877d1a7d498833da325aa3d3a699353 /libbuild2/cc/lexer.hxx
parent	ea24f530048cbce0c5335ca3fd3632c8ce34315a (diff)