Initial lexer implementation for buildfiles

author: Boris Kolpackov <boris@codesynthesis.com> 2014-12-11 13:57:42 +0200
committer: Boris Kolpackov <boris@codesynthesis.com> 2014-12-11 13:57:42 +0200
commit: e6d92a1fb21232ab09886431d39ccb8a95c7c68d (patch)
tree: 0d543e1e3c1b22e88f22f02e2dae75ae9eba2db5 /build/lexer.cxx
parent: fdc21950905d64b2ca1df5a0b2622022beffe922 (diff)
1 files changed, 220 insertions, 0 deletions
diff --git a/build/lexer.cxx b/build/lexer.cxx
new file mode 100644
index 0000000..101227e
--- /dev/null
+++ b/build/lexer.cxx
@@ -0,0 +1,220 @@
+// file      : build/lexer.cxx -*- C++ -*-
+// copyright : Copyright (c) 2014-2015 Code Synthesis Tools CC
+// license   : MIT; see accompanying LICENSE file
+
+#include <build/lexer>
+
+#include <iostream>
+
+using namespace std;
+
+namespace build
+{
+  token lexer::
+  next ()
+  {
+    skip_spaces ();
+
+    xchar c (get ());
+    uint64_t ln (c.line ()), cn (c.column ());
+
+    if (is_eos (c))
+      return token (ln, cn);
+
+    switch (c)
+    {
+      // NOTE: remember to update name() if adding new punctuations.
+      //
+    case '\n':
+      {
+        return token (token_punctuation::newline, ln, cn);
+      }
+    case ':':
+      {
+        return token (token_punctuation::colon, ln, cn);
+      }
+    case '{':
+      {
+        return token (token_punctuation::lcbrace, ln, cn);
+      }
+    case '}':
+      {
+        return token (token_punctuation::rcbrace, ln, cn);
+      }
+    }
+
+    // Otherwise it is a name.
+    //
+    return name (c);
+  }
+
+  lexer::xchar lexer::
+  escape ()
+  {
+    xchar c (get ());
+
+    if (!is_eos (c))
+      return c;
+
+    cerr << name_ << ':' << c.line () << ':' << c.column () << ": error: " <<
+      "unterminated escape sequence" << endl;
+    throw lexer_error ();
+  }
+
+  void lexer::
+  skip_spaces ()
+  {
+    xchar c (peek ());
+    bool start (c.column () == 1);
+
+    for (; !is_eos (c); c = peek ())
+    {
+      switch (c)
+      {
+      case ' ':
+      case '\t':
+        break;
+      case '\n':
+        {
+          // Skip empty lines.
+          //
+          if (start)
+            break;
+
+          return;
+        }
+      case '#':
+        {
+          get ();
+
+          // Read until newline or eos.
+          //
+          for (c = peek (); !is_eos (c) && c != '\n'; c = peek ())
+            get ();
+          continue;
+        }
+      case '\\':
+        {
+          get ();
+
+          if (peek () == '\n')
+            break;
+
+          unget (c);
+          // Fall through.
+        }
+      default:
+        return; // Not a space.
+      }
+
+      get ();
+    }
+  }
+
+  token lexer::
+  name (xchar c)
+  {
+    uint64_t ln (c.line ()), cn (c.column ());
+    string lexeme;
+    lexeme += (c != '\\' ? c : escape ());
+
+    for (c = peek (); !is_eos (c); c = peek ())
+    {
+      switch (c)
+      {
+      case ' ':
+      case '\t':
+      case '\n':
+      case ':':
+      case '{':
+      case '}':
+      case '#':
+        {
+          break;
+        }
+      case '\\':
+        {
+          get ();
+          lexeme += escape ();
+          continue;
+        }
+      default:
+        {
+          get ();
+          lexeme += c;
+          continue;
+        }
+      }
+
+      break;
+    }
+
+    return token (lexeme, ln, cn);
+  }
+
+  lexer::xchar lexer::
+  peek ()
+  {
+    if (unget_)
+      return buf_;
+    else
+    {
+      if (eos_)
+        return xchar (xchar::traits_type::eof (), l_, c_);
+      else
+      {
+        xchar::int_type v (is_.peek ());
+
+        if (v == xchar::traits_type::eof ())
+          eos_ = true;
+
+        return xchar (v, l_, c_);
+      }
+    }
+  }
+
+  lexer::xchar lexer::
+  get ()
+  {
+    if (unget_)
+    {
+      unget_ = false;
+      return buf_;
+    }
+    else
+    {
+      // When is_.get () returns eof, the failbit is also set (stupid,
+      // isn't?) which may trigger an exception. To work around this
+      // we will call peek() first and only call get() if it is not
+      // eof. But we can only call peek() on eof once; any subsequent
+      // calls will spoil the failbit (even more stupid).
+      //
+      xchar c (peek ());
+
+      if (!is_eos (c))
+      {
+        is_.get ();
+
+        if (c == '\n')
+        {
+          l_++;
+          c_ = 1;
+        }
+        else
+          c_++;
+      }
+
+      return c;
+    }
+  }
+
+  void lexer::
+  unget (const xchar& c)
+  {
+    // Because iostream::unget cannot work once eos is reached,
+    // we have to provide our own implementation.
+    //
+    buf_ = c;
+    unget_ = true;
+  }
+}
author	Boris Kolpackov <boris@codesynthesis.com>	2014-12-11 13:57:42 +0200
committer	Boris Kolpackov <boris@codesynthesis.com>	2014-12-11 13:57:42 +0200
commit	e6d92a1fb21232ab09886431d39ccb8a95c7c68d (patch)
tree	0d543e1e3c1b22e88f22f02e2dae75ae9eba2db5 /build/lexer.cxx
parent	fdc21950905d64b2ca1df5a0b2622022beffe922 (diff)