From e6d92a1fb21232ab09886431d39ccb8a95c7c68d Mon Sep 17 00:00:00 2001
From: Boris Kolpackov <boris@codesynthesis.com>
Date: Thu, 11 Dec 2014 13:57:42 +0200
Subject: Initial lexer implementation for buildfiles

---
 build/bd.cxx    |  62 ++++++++++++++++
 build/buildfile |   1 +
 build/lexer     |  98 +++++++++++++++++++++++++
 build/lexer.cxx | 220 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 build/target    |   3 +-
 build/token     |  55 ++++++++++++++
 6 files changed, 438 insertions(+), 1 deletion(-)
 create mode 100644 build/buildfile
 create mode 100644 build/lexer
 create mode 100644 build/lexer.cxx
 create mode 100644 build/token
diff --git a/build/bd.cxx b/build/bd.cxx
index c592d64..33ee02f 100644
--- a/build/bd.cxx
+++ b/build/bd.cxx
@@ -6,6 +6,7 @@
 
 #include <vector>
 #include <cassert>
+#include <fstream>
 #include <iostream>
 #include <typeinfo>
 #include <system_error>
@@ -15,6 +16,9 @@
 #include <build/process>
 #include <build/diagnostics>
 
+#include <build/token>
+#include <build/lexer>
+
 using namespace std;
 
 namespace build
@@ -132,6 +136,64 @@ main (int argc, char* argv[])
   //
   tzset ();
 
+  // Parse buildfile.
+  //
+  path bf ("buildfile");
+
+  ifstream ifs (bf.string ().c_str ());
+  if (!ifs.is_open ())
+  {
+    cerr << "error: unable to open " << bf << " in read mode" << endl;
+    return 1;
+  }
+
+  ifs.exceptions (ifstream::failbit | ifstream::badbit);
+  lexer l (ifs, bf.string ());
+
+  try
+  {
+    for (token t (l.next ());; t = l.next ())
+    {
+      cout << t.line () << ':' << t.column () << ": ";
+
+      switch (t.type ())
+      {
+      case token_type::eos: cout << "<eos>"; break;
+      case token_type::punctuation:
+        {
+          switch (t.punctuation ())
+          {
+          case token_punctuation::newline: cout << "\\n"; break;
+          case token_punctuation::colon:   cout << ':'; break;
+          case token_punctuation::lcbrace: cout << '{'; break;
+          case token_punctuation::rcbrace: cout << '}'; break;
+          }
+          break;
+        }
+      case token_type::name: cout << '\'' << t.name () << '\''; break;
+      }
+
+      cout << endl;
+
+      if (t.type () == token_type::eos)
+        break;
+    }
+  }
+  catch (const lexer_error&)
+  {
+    return 1; // Diagnostics has already been issued.
+  }
+  catch (const std::ios_base::failure&)
+  {
+    cerr << "error: failed to read from " << bf << endl;
+    return 1;
+  }
+
+  return 0;
+
+
+  // Register rules.
+  //
   cxx::link cxx_link;
   rules.emplace (typeid (exe), cxx_link);
 
diff --git a/build/buildfile b/build/buildfile
new file mode 100644
index 0000000..ceed236
--- /dev/null
+++ b/build/buildfile
@@ -0,0 +1 @@
+exe{bd}: obj{bd target}
diff --git a/build/lexer b/build/lexer
new file mode 100644
index 0000000..987efab
--- /dev/null
+++ b/build/lexer
@@ -0,0 +1,98 @@
+// file      : build/lexer -*- C++ -*-
+// copyright : Copyright (c) 2014-2015 Code Synthesis Tools CC
+// license   : MIT; see accompanying LICENSE file
+
+#ifndef BUILD_LEXER
+#define BUILD_LEXER
+
+#include <string>
+#include <iosfwd>
+#include <cstdint> // uint64_t
+#include <exception>
+
+#include <build/token>
+
+namespace build
+{
+  // Thrown by the lexer after it has printed the error to cerr, so the
+  // handler must assume the diagnostics have already been issued.
+  struct lexer_error: std::exception {};
+
+  class lexer // Splits an input stream into tokens (see next()).
+  {
+  public:
+    lexer (std::istream& is, const std::string& name) // name: for diagnostics.
+        : is_ (is), name_ (name) {}
+
+    token
+    next (); // Return the next token; an eos token at end of stream.
+
+    // Character interface.
+    //
+  private:
+    class xchar // A character (or eof) plus the line/column it was read at.
+    {
+    public:
+      typedef std::char_traits<char> traits_type;
+      typedef traits_type::int_type int_type;
+      typedef traits_type::char_type char_type;
+
+      xchar (int_type v, std::uint64_t l, std::uint64_t c)
+          : v_ (v), l_ (l), c_ (c) {}
+
+      operator char_type () const {return static_cast<char_type> (v_);} // Lossy for eof.
+
+      int_type
+      value () const {return v_;} // Unconverted value; compare with eof().
+
+      std::uint64_t line () const {return l_;}
+      std::uint64_t column () const {return c_;}
+
+    private:
+      int_type v_;
+      std::uint64_t l_;
+      std::uint64_t c_;
+    };
+
+    xchar
+    peek (); // Look at the next character without consuming it.
+
+    xchar
+    get (); // Consume the next character, advancing the line/column.
+
+    void
+    unget (const xchar&); // Put one character back (single-slot buffer).
+
+    // Tests.
+    //
+    bool
+    is_eos (const xchar& c) const
+    {
+      return c.value () == xchar::traits_type::eof ();
+    }
+
+  private:
+    xchar
+    escape (); // Character after a backslash; throws lexer_error on eos.
+
+    void
+    skip_spaces (); // Skip blanks, comments, and line continuations.
+
+    token
+    name (xchar); // Lex a name that starts with the given character.
+
+  private:
+    std::istream& is_;
+    std::string name_; // Input name printed as the prefix of diagnostics.
+
+    std::uint64_t l_ {1}; // Line of the next character to be read.
+    std::uint64_t c_ {1}; // Column of the next character to be read.
+
+    bool eos_ {false}; // Set once is_.peek () has returned eof.
+
+    bool unget_ {false}; // True while buf_ holds a put-back character.
+    xchar buf_ {0, 0, 0};
+  };
+}
+
+#endif // BUILD_LEXER
diff --git a/build/lexer.cxx b/build/lexer.cxx
new file mode 100644
index 0000000..101227e
--- /dev/null
+++ b/build/lexer.cxx
@@ -0,0 +1,220 @@
+// file      : build/lexer.cxx -*- C++ -*-
+// copyright : Copyright (c) 2014-2015 Code Synthesis Tools CC
+// license   : MIT; see accompanying LICENSE file
+
+#include <build/lexer>
+
+#include <iostream>
+
+using namespace std;
+
+namespace build
+{
+  token lexer::
+  next ()
+  {
+    // Skip whitespace, comments, and line continuations first.
+    //
+    skip_spaces ();
+
+    const xchar first (get ());
+    const uint64_t line (first.line ());
+    const uint64_t column (first.column ());
+
+    // End of stream yields the eos token.
+    //
+    if (is_eos (first))
+      return token (line, column);
+
+    // Single-character punctuation.
+    // NOTE: remember to update name() if adding new punctuations.
+    //
+    const char ch (first);
+
+    if (ch == '\n')
+      return token (token_punctuation::newline, line, column);
+
+    if (ch == ':')
+      return token (token_punctuation::colon, line, column);
+
+    if (ch == '{')
+      return token (token_punctuation::lcbrace, line, column);
+
+    if (ch == '}')
+      return token (token_punctuation::rcbrace, line, column);
+
+    // Anything else starts a name.
+    //
+    return name (first);
+  }
+
+  lexer::xchar lexer::
+  escape ()
+  {
+    const xchar escaped (get ()); // Character following the backslash.
+    if (is_eos (escaped))
+    {
+      cerr << name_ << ':' << escaped.line () << ':' << escaped.column ()
+           << ": error: " << "unterminated escape sequence" << endl;
+      throw lexer_error ();
+    }
+    return escaped;
+  }
+
+  void lexer::
+  skip_spaces ()
+  {
+    xchar c (peek ());
+    bool start (c.column () == 1); // Were we called at the start of a line?
+    // Skip blanks, '#' comments, and backslash-newline pairs.
+    for (; !is_eos (c); c = peek ())
+    {
+      switch (c)
+      {
+      case ' ':
+      case '\t':
+        break; // Consumed by the get() at the bottom of the loop.
+      case '\n':
+        {
+          // Skip empty lines.
+          //
+          if (start)
+            break;
+
+          return; // Leave the newline in the stream for the caller.
+        }
+      case '#':
+        {
+          get ();
+
+          // Read until newline or eos.
+          //
+          for (c = peek (); !is_eos (c) && c != '\n'; c = peek ())
+            get ();
+          continue; // The newline (if any) is examined by the next iteration.
+        }
+      case '\\':
+        {
+          get ();
+
+          if (peek () == '\n')
+            break; // Line continuation: the newline is consumed below.
+
+          unget (c); // Not a continuation, so put the backslash back.
+          // Fall through.
+        }
+      default:
+        return; // Not a space.
+      }
+
+      get ();
+    }
+  }
+
+  token lexer::
+  name (xchar c)
+  {
+    // The token is positioned at its first character.
+    //
+    const uint64_t line (c.line ());
+    const uint64_t column (c.column ());
+
+    // A leading backslash escapes the character that follows it.
+    //
+    string text;
+    if (c == '\\')
+      text += escape ();
+    else
+      text += c;
+
+    // Accumulate characters until eos or a separator; the separator is
+    // only peeked at and so is left unconsumed.
+    //
+    for (;;)
+    {
+      c = peek ();
+
+      const bool separator (
+        c == ' ' || c == '\t' || c == '\n' ||
+        c == ':' || c == '{' || c == '}' || c == '#');
+
+      if (is_eos (c) || separator)
+        break;
+
+      get ();
+
+      if (c == '\\')
+        text += escape ();
+      else
+        text += c;
+    }
+
+    return token (text, line, column);
+  }
+
+  lexer::xchar lexer::
+  peek ()
+  {
+    // A character that was put back takes precedence over the stream.
+    //
+    if (unget_)
+      return buf_;
+
+    // Once eos has been seen, keep reporting it without touching the
+    // stream again.
+    //
+    if (eos_)
+      return xchar (xchar::traits_type::eof (), l_, c_);
+
+    const xchar::int_type next (is_.peek ());
+    if (next == xchar::traits_type::eof ())
+      eos_ = true;
+
+    return xchar (next, l_, c_);
+  }
+
+  lexer::xchar lexer::
+  get ()
+  {
+    if (unget_)
+    {
+      unget_ = false;
+      return buf_;
+    }
+
+    // When is_.get () returns eof, the failbit is also set, which may
+    // trigger an exception. To work around this we call peek() first
+    // and only call is_.get () if the result is not eof. The stream's
+    // peek() can only be called on eof once (a repeated call would set
+    // the failbit), which is why peek() remembers eos in eos_.
+    //
+    const xchar cur (peek ());
+
+    if (is_eos (cur))
+      return cur;
+
+    is_.get ();
+
+    // Advance the position: a newline starts the next line at column 1.
+    //
+    if (cur != '\n')
+      ++c_;
+    else
+    {
+      ++l_;
+      c_ = 1;
+    }
+
+    return cur;
+  }
+
+  void lexer::
+  unget (const xchar& c)
+  {
+    // The stream's own unget() does not work once eos has been reached,
+    // so keep a one-character put-back buffer of our own instead.
+    //
+    unget_ = true;
+    buf_ = c;
+  }
+}
diff --git a/build/target b/build/target
index 3ef3192..01cddc4 100644
--- a/build/target
+++ b/build/target
@@ -11,6 +11,7 @@
 #include <typeindex>
 #include <iosfwd>
 #include <cassert>
+#include <utility>    // move
 
 #include <build/path>
 #include <build/timestamp>
@@ -27,7 +28,7 @@ namespace build
   class target
   {
   public:
-    target (std::string n): name_ (n) {}
+    target (std::string n): name_ (std::move (n)) {}
 
     const std::string&
     name () const {return name_;}
diff --git a/build/token b/build/token
new file mode 100644
index 0000000..bade45c
--- /dev/null
+++ b/build/token
@@ -0,0 +1,55 @@
+// file      : build/token -*- C++ -*-
+// copyright : Copyright (c) 2014-2015 Code Synthesis Tools CC
+// license   : MIT; see accompanying LICENSE file
+
+#ifndef BUILD_TOKEN
+#define BUILD_TOKEN
+
+#include <string>
+#include <cstddef> // size_t
+#include <cstdint> // uint64_t
+#include <cassert>
+#include <utility> // move
+
+namespace build
+{
+  enum class token_type {eos, name, punctuation}; // eos: end of stream.
+  enum class token_punctuation {newline, colon, lcbrace, rcbrace}; // \n : { }
+
+  class token // A lexed token: its type, payload, and start position.
+  {
+  public:
+    token_type
+    type () const {return t_;}
+
+    std::string const&
+    name () const {assert (t_ == token_type::name); return n_;}
+
+    token_punctuation
+    punctuation () const {assert (t_ == token_type::punctuation); return p_;}
+
+    std::uint64_t line () const {return l_;}
+    std::uint64_t column () const {return c_;}
+
+  public:
+    token (std::uint64_t l, std::uint64_t c) // End-of-stream token.
+        : t_ (token_type::eos), l_ (l), c_ (c) {}
+
+    token (std::string n, std::uint64_t l, std::uint64_t c) // Name token.
+        : t_ (token_type::name), n_ (std::move (n)), l_ (l), c_ (c) {}
+
+    token (token_punctuation p, std::uint64_t l, std::uint64_t c) // Punctuation.
+        : t_ (token_type::punctuation), p_ (p), l_ (l), c_ (c) {}
+
+  private:
+    token_type t_;
+    // p_ is defaulted so that copying non-punctuation tokens is well-defined.
+    token_punctuation p_ {token_punctuation::newline};
+    std::string n_;
+
+    std::uint64_t l_;
+    std::uint64_t c_;
+  };
+}
+
+#endif // BUILD_TOKEN
-- 
cgit v1.1