1 files changed, 1129 insertions, 0 deletions
diff --git a/libbuild2/cc/lexer.cxx b/libbuild2/cc/lexer.cxx
new file mode 100644
index 0000000..6eba57e
--- /dev/null
+++ b/libbuild2/cc/lexer.cxx
@@ -0,0 +1,1129 @@
+// file      : libbuild2/cc/lexer.cxx -*- C++ -*-
+// copyright : Copyright (c) 2014-2019 Code Synthesis Ltd
+// license   : MIT; see accompanying LICENSE file
+
+#include <libbuild2/cc/lexer.hxx>
+
+using namespace std;
+using namespace butl;
+
+// bit 0 - identifier character (_0-9A-Za-z).
+//
+static const uint8_t char_flags[256] =
+//0    1    2    3    4    5    6    7      8    9    A    B    C    D    E    F
+{
+  0,   0,   0,   0,   0,   0,   0,   0,     0,   0,   0,   0,   0,   0,   0,   0, // 0
+  0,   0,   0,   0,   0,   0,   0,   0,     0,   0,   0,   0,   0,   0,   0,   0, // 1
+  0,   0,   0,   0,   0,   0,   0,   0,     0,   0,   0,   0,   0,   0,   0,   0, // 2
+  1,   1,   1,   1,   1,   1,   1,   1,     1,   1,   0,   0,   0,   0,   0,   0, // 3
+  0,   1,   1,   1,   1,   1,   1,   1,     1,   1,   1,   1,   1,   1,   1,   1, // 4
+  1,   1,   1,   1,   1,   1,   1,   1,     1,   1,   1,   0,   0,   0,   0,   1, // 5
+  0,   1,   1,   1,   1,   1,   1,   1,     1,   1,   1,   1,   1,   1,   1,   1, // 6
+  1,   1,   1,   1,   1,   1,   1,   1,     1,   1,   1,   0,   0,   0,   0,   0, // 7
+
+  // 128-255 (never identifier characters)
+  0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,
+  0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,
+  0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,
+  0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0
+};
+
+// Diagnostics plumbing.
+//
+namespace butl // ADL
+{
+  inline build2::location
+  get_location (const butl::char_scanner::xchar& c, const void* data)
+  {
+    // The data is the path being lexed (e.g., it must be &lexer::name_).
+    assert (data != nullptr);
+    const path* file (static_cast<const path*> (data));
+    return build2::location (file, c.line, c.column);
+  }
+}
+
+namespace build2
+{
+  namespace cc
+  {
+    auto lexer::
+    peek (bool e) -> xchar
+    {
+      if (unget_) // The pending ungot character, if any, comes first.
+        return ungetc_;
+
+      if (unpeek_) // Then the saved character (e.g., the backslash below).
+        return unpeekc_;
+
+      xchar c (base::peek ());
+
+      if (e && c == '\\') // Possible line continuation (if enabled with e).
+      {
+        get (c); // Consume the backslash to see what follows it.
+        xchar p (base::peek ());
+
+        // Handle Windows CRLF sequence. Similar to char_scanner, we treat a
+        // single CR as if it was followed by LF and also collapse multiple
+        // CRs.
+        //
+        while (p == '\r')
+        {
+          get (p);
+          p = base::peek ();
+
+          if (p == '\n')
+            break;
+
+          // Pretend '\n' was there and recurse.
+          //
+          if (p != '\r')
+            return peek (e);
+        }
+
+        if (p == '\n') // Line continuation: skip it and peek past it.
+        {
+          get (p);
+          return peek (e); // Recurse.
+        }
+
+        // Not a line continuation. Save the (already consumed) backslash in
+        // the unpeek buffer so that it is returned on the subsequent calls
+        // to peek() (until get()).
+        unpeek_ = true;
+        unpeekc_ = c;
+      }
+
+      return c;
+    }
+
+    inline auto lexer::
+    get (bool e) -> xchar
+    {
+      if (unget_) // Hand out the ungot character and clear the flag.
+      {
+        unget_ = false;
+        return ungetc_;
+      }
+
+      // Otherwise peek at the next character and then consume it.
+      //
+      xchar r (peek (e));
+      get (r);
+      return r;
+    }
+
+    inline void lexer::
+    get (const xchar& c)
+    {
+      // Keep the logical line in sync with the physical one, which is
+      // incremented by base (the column counts are the same).
+      //
+      const bool newline (!unget_ && c == '\n');
+
+      if (newline && log_line_) ++*log_line_;
+      base::get (c);
+    }
+
+    inline auto lexer::
+    geth (bool e) -> xchar
+    {
+      xchar r (get (e)); // Get the next character...
+      cs_.append (r);    // ...and add it to the checksum.
+      return r;
+    }
+
+    inline void lexer::
+    geth (const xchar& c)
+    {
+      get (c);        // Consume the (previously peeked) character...
+      cs_.append (c); // ...and add it to the checksum.
+    }
+
+    using type = token_type; // Shorthand for naming the token_type values.
+
+    void lexer::
+    next (token& t, xchar c, bool ignore_pp)
+    {
+      // Lex the next token into t starting with the already-read char c.
+      // If ignore_pp is true, preprocessor lines are consumed (recognizing
+      // #line) rather than returned; otherwise '#' is plain punctuation.
+      //
+      for (;; c = skip_spaces ())
+      {
+        t.file = log_file_;
+        t.line = log_line_ ? *log_line_ : c.line;
+        t.column = c.column;
+
+        if (eos (c))
+        {
+          t.type = type::eos;
+          return;
+        }
+
+        const location l (&name_, c.line, c.column);
+
+        // Hash the token's line. The reason is debug info. In fact, doing
+        // this will make quite a few "noop" changes (like adding a newline
+        // anywhere in the source) cause the checksum change. But there
+        // doesn't seem to be any way around it: the case where we benefit
+        // from the precise change detection the most (development) is also
+        // where we will most likely have debug info enabled.
+        //
+        // Note that in order not to make this completely useless we don't
+        // hash the column. Even if it is part of the debug info, having it a
+        // bit off shouldn't cause any significant mis-positioning. We also
+        // don't hash the file path for each token instead only hashing it
+        // when changed with the #line directive (as well as in the
+        // constructor for the initial path).
+        //
+        cs_.append (t.line);
+        cs_.append (c);
+
+        switch (c)
+        {
+          // Preprocessor lines.
+          //
+        case '#':
+          {
+            // It is tempting to simply scan until the newline ignoring
+            // anything in between. However, these lines can start a
+            // multi-line C-style comment. So we have to tokenize them (and
+            // hash the data for each token).
+            //
+            // Note that this may not work for things like #error that can
+            // contain pretty much anything. Also note that lines that start
+            // with '#' can contain '#' further down. In this case we need to
+            // be careful not to recurse (and consume multiple newlines). Thus
+            // the ignore_pp flag.
+            //
+            // Finally, to support diagnostics properly we need to recognize
+            // #line directives.
+            //
+            if (ignore_pp)
+            {
+              for (bool first (true);;)
+              {
+                // Note that we keep using the passed token for buffers.
+                //
+                c = skip_spaces (false); // Stop at newline.
+
+                if (eos (c) || c == '\n')
+                  break;
+
+                if (first)
+                {
+                  first = false;
+
+                  // Recognize #line and its shorthand version:
+                  //
+                  // #line <integer> [<string literal>] ...
+                  // #     <integer> [<string literal>] ...
+                  //
+                  // Also diagnose #include while at it.
+                  //
+                  if (!(c >= '0' && c <= '9'))
+                  {
+                    next (t, c, false);
+
+                    if (t.type != type::identifier)
+                      continue;
+
+                    if (t.value == "include")
+                      fail (l) << "unexpected #include directive";
+
+                    if (t.value != "line")
+                      continue;
+
+                    c = skip_spaces (false);
+
+                    if (!(c >= '0' && c <= '9'))
+                      fail (c) << "line number expected after #line directive";
+                  }
+
+                  // Ok, this is #line and next comes the line number.
+                  //
+                  line_directive (t, c);
+                  continue; // Parse the tail, if any.
+                }
+
+                next (t, c, false);
+              }
+              break; // Done with this line, continue with the next token.
+            }
+            else
+            {
+              t.type = type::punctuation;
+              return;
+            }
+          }
+          // Single-letter punctuation.
+          //
+        case ';': t.type = type::semi;    return;
+        case '{': t.type = type::lcbrace; return;
+        case '}': t.type = type::rcbrace; return;
+          // Other single-letter punctuation.
+          //
+        case '(':
+        case ')':
+        case '[':
+        case ']':
+        case ',':
+        case '?':
+        case '~':
+        case '\\': t.type = type::punctuation; return;
+          // Potentially multi-letter punctuation.
+          //
+        case '.': // . .* .<N> ...
+          {
+            xchar p (peek ());
+
+            if (p == '*')
+            {
+              geth (p);
+              t.type = type::punctuation;
+              return;
+            }
+            else if (p >= '0' && p <= '9')
+            {
+              number_literal (t, c);
+              return;
+            }
+            else if (p == '.')
+            {
+              get (p); // Don't hash yet: we may have to unget it.
+
+              xchar q (peek ());
+              if (q == '.')
+              {
+                cs_.append (p);
+
+                geth (q);
+                t.type = type::punctuation;
+                return;
+              }
+              unget (p);
+              // Fall through.
+            }
+
+            t.type = type::dot;
+            return;
+          }
+        case '=': // = ==
+        case '!': // ! !=
+        case '*': // * *=
+        case '/': // / /=   (/* and // handled by skip_spaces() above)
+        case '%': // % %=
+        case '^': // ^ ^=
+          {
+            xchar p (peek ());
+
+            if (p == '=')
+              geth (p);
+
+            t.type = type::punctuation;
+            return;
+          }
+        case '<': // < <= << <<=
+        case '>': // > >= >> >>=
+          {
+            xchar p (peek ());
+
+            if (p == c)
+            {
+              geth (p);
+              if ((p = peek ()) == '=')
+                geth (p);
+              t.type = type::punctuation;
+            }
+            else if (p == '=')
+            {
+              geth (p);
+              t.type = type::punctuation;
+            }
+            else
+              t.type = (c == '<' ? type::less : type::greater);
+
+            return;
+          }
+        case '+': // + ++ +=
+        case '-': // - -- -= -> ->*
+          {
+            xchar p (peek ());
+
+            if (p == c || p == '=')
+              geth (p);
+            else if (c == '-' && p == '>')
+            {
+              geth (p);
+              if ((p = peek ()) == '*')
+                geth (p);
+            }
+
+            t.type = type::punctuation;
+            return;
+          }
+        case '&': // & && &=
+        case '|': // | || |=
+          {
+            xchar p (peek ());
+
+            if (p == c || p == '=')
+              geth (p);
+
+            t.type = type::punctuation;
+            return;
+          }
+        case ':': // : ::
+          {
+            xchar p (peek ());
+
+            if (p == ':')
+              geth (p);
+
+            t.type = type::punctuation;
+            return;
+          }
+          // Number (and also .<N> above).
+          //
+        case '0':
+        case '1':
+        case '2':
+        case '3':
+        case '4':
+        case '5':
+        case '6':
+        case '7':
+        case '8':
+        case '9':
+          {
+            number_literal (t, c);
+            return;
+          }
+          // Char/string literal, identifier, or other ($, @, `, etc).
+          //
+        default:
+          {
+            bool raw (false); // Raw string literal.
+
+            // Note: known not to be a digit (see above).
+            //
+            if (char_flags[static_cast<uint8_t> (c)] & 0x01)
+            {
+              // This smells a little: we know skip_spaces() did not peek at
+              // the next character because this is not '/'. Which means the
+              // position in the stream must be of this character + 1.
+              //
+              t.position = buf_->tellg () - 1;
+
+              string& id (t.value);
+              id = c;
+
+              while (char_flags[static_cast<uint8_t> (c = peek ())] & 0x01)
+              {
+                geth (c);
+                id += c;
+
+                // Direct buffer scan. Note that we always follow up with the
+                // normal peek() call which may load the next chunk, handle
+                // line continuations, etc. In other words, the end of the
+                // "raw" scan doesn't necessarily mean the end.
+                //
+                const char* b (gptr_);
+                const char* p (b);
+
+                for (const char* e (egptr_);
+                     p != e && char_flags[static_cast<uint8_t> (*p)] & 0x01;
+                     ++p) ;
+
+                // Unrolling this loop doesn't make a difference.
+                //
+                // for (const char* e (egptr_ - 4); p < e; p += 4)
+                // {
+                //   uint8_t c;
+                //
+                //  c = static_cast<uint8_t> (p[0]);
+                //  if (!(char_flags[c] & 0x01)) break;
+                //
+                //  c = static_cast<uint8_t> (p[1]);
+                //  if (!(char_flags[c] & 0x01)) {p += 1; break;}
+                //
+                //  c = static_cast<uint8_t> (p[2]);
+                //  if (!(char_flags[c] & 0x01)) {p += 2; break;}
+                //
+                //  c = static_cast<uint8_t> (p[3]);
+                //  if (!(char_flags[c] & 0x01)) {p += 3; break;}
+                // }
+
+                size_t n (p - b);
+                id.append (b, n); cs_.append (b, n);
+                gptr_ = p; buf_->gbump (static_cast<int> (n)); column += n;
+              }
+
+              // If the following character is a quote, see if the identifier
+              // is one of the literal prefixes.
+              //
+              if (c == '\'' || c == '\"')
+              {
+                size_t n (id.size ()), i (0); // i: prefix characters matched.
+                switch (id[0])
+                {
+                case 'u':
+                  {
+                    if (n > 1 && id[1] == '8')
+                      ++i;
+                  }
+                  // Fall through.
+                case 'L':
+                case 'U':
+                  {
+                    ++i;
+
+                    if (c == '\"' && n > i && id[i] == 'R')
+                    {
+                      ++i;
+                      raw = true;
+                    }
+                    break;
+                  }
+                case 'R':
+                  {
+                    if (c == '\"')
+                    {
+                      ++i;
+                      raw = true;
+                    }
+                    break;
+                  }
+                }
+
+                if (i == n) // All characters "consumed".
+                {
+                  geth (c);
+                  id.clear ();
+                }
+              }
+
+              if (!id.empty ())
+              {
+                t.type = type::identifier;
+                return;
+              }
+            }
+
+            switch (c)
+            {
+            case '\'':
+              {
+                char_literal (t, c);
+                return;
+              }
+            case '\"':
+              {
+                if (raw)
+                  raw_string_literal (t, c);
+                else
+                  string_literal (t, c);
+                return;
+              }
+            default:
+              {
+                t.type = type::other;
+                return;
+              }
+            }
+          }
+        }
+      }
+    }
+
+    void lexer::
+    number_literal (token& t, xchar c)
+    {
+      // note: c is hashed (by the caller)
+
+      // A number (integer or floating point literal) can:
+      //
+      // 1. Start with a dot (which must be followed by a digit, e.g., .123).
+      //
+      // 2. Have a radix prefix (0b101, 0123, 0X12AB).
+      //
+      // 3. Have an exponent (1e10, 0x1.p-10, 1.).
+      //
+      // 4. Have digits separated with ' (123'456, 0xff00'00ff).
+      //
+      // 5. End with a built-in or user defined literal (123f, 123UL, 123_X).
+      //
+      // Quoting from GCC's preprocessor documentation:
+      //
+      // "Formally preprocessing numbers begin with an optional period, a
+      // required decimal digit, and then continue with any sequence of
+      // letters, digits, underscores, periods, and exponents. Exponents are
+      // the two-character sequences 'e+', 'e-', 'E+', 'E-', 'p+', 'p-', 'P+',
+      // and 'P-'."
+      //
+      // So it looks like a "C++ number" is then any unseparated (with
+      // whitespace or punctuation) sequence of those plus '. The only mildly
+      // tricky part is then to recognize +/- as being part of the exponent.
+      //
+      while (!eos ((c = peek ()))) // Note: c is only peeked at here.
+      {
+        switch (c)
+        {
+          // All the whitespace, punctuation, and other characters that end
+          // the number.
+          //
+        case ' ':
+        case '\n':
+        case '\t':
+        case '\r':
+        case '\f':
+        case '\v':
+
+        case '#':
+        case ';':
+        case '{':
+        case '}':
+        case '(':
+        case ')':
+        case '[':
+        case ']':
+        case ',':
+        case '?':
+        case '~':
+        case '=':
+        case '!':
+        case '*':
+        case '/':
+        case '%':
+        case '^':
+        case '>':
+        case '<':
+        case '&':
+        case '|':
+        case ':':
+        case '+': // The exponent case is handled below.
+        case '-': // The exponent case is handled below.
+        case '"':
+        case '\\':
+
+        case '@':
+        case '$':
+        case '`':
+          break; // Out of the switch and then out of the loop (see below).
+
+          // Recognize +/- after the exponent.
+          //
+        case 'e':
+        case 'E':
+        case 'p':
+        case 'P':
+          {
+            geth (c);
+            c = peek ();
+            if (c == '+' || c == '-')
+              geth (c);
+            continue;
+          }
+
+        case '_':
+        case '.':
+        case '\'':
+        default: // Digits and letters.
+          {
+            geth (c);
+            continue;
+          }
+        }
+
+        break; // End of the number (the terminator is left unconsumed).
+      }
+
+      t.type = type::number;
+    }
+
+    void lexer::
+    char_literal (token& t, xchar c)
+    {
+      // note: c (the opening quote) is already hashed
+
+      const location start (&name_, c.line, c.column);
+
+      for (char prev (c);;) // Previous character (see below).
+      {
+        c = geth ();
+
+        if (eos (c) || c == '\n')
+          fail (start) << "unterminated character literal";
+
+        if (prev != '\\' && c == '\'') // Unescaped closing quote.
+          break;
+
+        // Forget an escaped backslash so that '\\' is not taken for \'.
+        //
+        const bool esc (c == '\\' && prev == '\\');
+        prev = esc ? '\0' : static_cast<char> (c);
+      }
+
+      // Consume the user-defined suffix (which is an identifier), if any.
+      //
+      c = peek ();
+      if (c == '_' || alpha (c)) literal_suffix (c);
+
+      t.type = type::character;
+    }
+
+    void lexer::
+    string_literal (token& t, xchar c)
+    {
+      // note: c (the opening quote) is hashed (by the caller)
+
+      const location l (&name_, c.line, c.column);
+
+      for (char p (c);;) // Previous character (see below).
+      {
+        c = geth ();
+
+        if (eos (c) || c == '\n')
+          fail (l) << "unterminated string literal";
+
+        if (c == '\"' && p != '\\') // Unescaped closing quote.
+          break;
+
+        // Keep track of \\-escapings so we don't confuse them with \", as in
+        // "\\".
+        //
+        p = (c == '\\' && p == '\\') ? '\0' : static_cast<char> (c);
+
+        // Direct buffer scan: hash the run of characters up to the next
+        // quote, backslash, or newline (or the end of the buffer) in one go.
+        if (p != '\\') // Not right after an escaping backslash.
+        {
+          const char* b (gptr_);
+          const char* e (egptr_);
+          const char* p (b); // Note: shadows the previous character p.
+
+          for (char c;
+               p != e && (c = *p) != '\"' && c != '\\' && c != '\n';
+               ++p) ;
+
+          size_t n (p - b); // Number of characters scanned.
+          cs_.append (b, n);
+          gptr_ = p; buf_->gbump (static_cast<int> (n)); column += n;
+        }
+      }
+
+      // See if we have a user-defined suffix (which is an identifier).
+      //
+      if ((c = peek ()) == '_' || alpha (c))
+        literal_suffix (c);
+
+      t.type = type::string;
+    }
+
+    void lexer::
+    raw_string_literal (token& t, xchar c)
+    {
+      // note: c (the opening quote) is already hashed
+      //
+      // A raw string literal has the following overall form:
+      //
+      //   R"<delimiter>(<raw_characters>)<delimiter>"
+      //
+      // The <delimiter> may be empty and otherwise consists of any source
+      // characters except parentheses, backslash, and spaces. It can be at
+      // most 16 characters long (which is not enforced here). The
+      // <raw_characters> are taken verbatim, without any processing (not
+      // even of line continuations).
+      //
+      const location start (&name_, c.line, c.column);
+
+      // Collect the closing sequence, that is, )<delimiter>", while consuming
+      // the delimiter together with the opening parenthesis.
+      //
+      string term (1, ')');
+
+      while ((c = geth ()) != '(')
+      {
+        const bool bad (eos (c)   ||
+                        c == '\"' ||
+                        c == ')'  ||
+                        c == '\\' ||
+                        c == ' ');
+
+        if (bad)
+          fail (start) << "invalid raw string literal";
+
+        term += c;
+      }
+
+      term += '"';
+
+      // Consume the raw characters until the closing sequence is matched.
+      // Since ')' can only be its first character, on a mismatch we simply
+      // start over (re-examining the current character as the first match).
+      //
+      for (size_t m (0); m != term.size (); ) // Characters matched so far.
+      {
+        c = geth (false); // No newline escaping.
+
+        if (eos (c)) // Note: newline is ok.
+          fail (start) << "invalid raw string literal";
+
+        if (c != term[m])
+          m = 0;
+
+        if (c == term[m])
+          ++m;
+      }
+
+      // Consume the user-defined suffix (which is an identifier), if any.
+      //
+      c = peek ();
+      if (c == '_' || alpha (c))
+        literal_suffix (c);
+
+      t.type = type::string;
+    }
+
+    void lexer::
+    literal_suffix (xchar c)
+    {
+      // note: c is unhashed (it has only been peeked at by the caller)
+      //
+      // Consume (and hash) the suffix identifier that starts with c.
+      //
+      do geth (c); while ((c = peek ()) == '_' || alnum (c));
+    }
+
+    void lexer::
+    line_directive (token& t, xchar c)
+    {
+      // enter: first digit of the line number
+      // leave: last character of the line number or file string
+      // note:  c is unhashed
+
+      // If our number and string tokens contained the literal values, then we
+      // could have used that. However, we ignore the value (along with escape
+      // processing, etc), for performance. Let's keep it that way and instead
+      // handle it ourselves.
+      //
+      // Note also that we are not hashing these at the character level
+      // instead hashing the switch to a new file path below and leaving the
+      // line number to the token line hashing.
+      //
+      {
+        string& s (t.value); // Reuse the token's value as a buffer.
+
+        for (s = c; (c = peek ()) >= '0' && c <= '9'; get (c))
+          s += c;
+
+        // The newline that ends the directive will increment the logical line
+        // so subtract one to compensate. Note: can't be 0 and shouldn't throw
+        // for valid lines.
+        //
+        log_line_ = stoull (s.c_str ()) - 1;
+      }
+
+      // See if we have the file.
+      //
+      c = skip_spaces (false);
+
+      if (c == '\"')
+      {
+        const location l (&name_, c.line, c.column);
+
+        // It is common to have a large number of #line directives that don't
+        // change the file (they seem to be used to track macro locations or
+        // some such). So we are going to optimize for this by comparing the
+        // current path to what's in #line.
+        //
+        string& s (tmp_file_);
+        s.clear ();
+
+        for (char p ('\0'); p != '\"'; ) // Previous character.
+        {
+          c = get ();
+
+          if (eos (c) || c == '\n')
+            fail (l) << "unterminated string literal";
+
+          // Handle escapes.
+          //
+          if (p == '\\')
+          {
+            p = '\0'; // Clear so we don't confuse \" and \\".
+
+            // We only handle what can reasonably be expected in a file name.
+            //
+            switch (c)
+            {
+            case '\\':
+            case '\'':
+            case '\"': break; // Add as is.
+            default:
+              fail (c) << "unsupported escape sequence in #line directive";
+            }
+          }
+          else
+          {
+            p = c;
+
+            switch (c)
+            {
+            case '\\':
+            case '\"': continue; // Escape start or closing quote: don't add.
+            }
+          }
+
+          s += c;
+
+          // Direct buffer scan: append the run of characters up to the next
+          // quote, backslash, or newline (or the end of the buffer) in one go.
+          if (p != '\\')
+          {
+            const char* b (gptr_);
+            const char* e (egptr_);
+            const char* p (b); // Note: shadows the previous character p.
+
+            for (char c;
+                 p != e && (c = *p) != '\"' && c != '\\' && c != '\n';
+                 ++p) ;
+
+            size_t n (p - b); // Number of characters scanned.
+            s.append (b, n);
+            gptr_ = p; buf_->gbump (static_cast<int> (n)); column += n;
+          }
+        }
+
+        if (log_file_.string () == s) // Same file: nothing else to do.
+          return;
+
+        // Swap the two string buffers.
+        //
+        {
+          string r (move (log_file_).string ()); // Move string rep out.
+          r.swap (s);
+          log_file_ = path (move (r)); // Move back in.
+        }
+
+        // If the path is relative, then prefix it with the current working
+        // directory. Failing that, we will end up with different checksums
+        // for invocations from different directories.
+        //
+        // While this should work fine for normal cross-compilation, it's an
+        // entirely different story for the emulated case (e.g., msvc-linux
+        // where the preprocessed output contains absolute Windows paths). So
+        // we try to sense if things look fishy and leave the path alone.
+        //
+        // Also detect special names like <built-in> and <command-line>. Plus
+        // GCC sometimes adds what looks like working directory (has trailing
+        // slash). So ignore that as well.
+        //
+        // We now switched to using absolute translation unit paths (because
+        // of __FILE__/assert(); see compile.cxx for details). But we might
+        // still need this logic when we try to calculate location-independent
+        // hash for distributed compilation/caching. The idea is to only hash
+        // the part starting from the project root which is immutable. Plus
+        // we will need -ffile-prefix-map to deal with __FILE__.
+        //
+        if (!log_file_.to_directory ())
+          cs_.append (log_file_.string ());
+#if 0
+        {
+          using tr = path::traits;
+          const string& f (log_file_.string ());
+
+          if (f.find (':') != string::npos            ||
+              (f.front () == '<' && f.back () == '>') ||
+              log_file_.absolute ())
+            cs_.append (f);
+          else
+          {
+            // This gets complicated and slow: the path may contain '..' and
+            // '.'  so strictly speaking we would need to normalize it.
+            // Instead, we are going to handle leading '..'s ourselves (the
+            // sane case) and ignore everything else (so if you have '..'  or
+            // '.' somewhere in the middle, then things might not work
+            // optimally for you).
+            //
+            const string& d (work.string ());
+
+            // Iterate over leading '..' in f "popping" the corresponding
+            // number of trailing components from d.
+            //
+            size_t fp (0);
+            size_t dp (d.size () - 1);
+
+            for (size_t p;; )
+            {
+              // Note that in file we recognize any directory separator, not
+              // just of this platform (see note about emulation above).
+              //
+              if (f.compare (fp, 2, "..") != 0  ||
+                  (f[fp + 2] != '/' && f[fp + 2] != '\\') || // Could be '\0'.
+                  (p = tr::rfind_separator (d, dp)) == string::npos)
+                break;
+
+              fp += 3;
+              dp = p - 1;
+            }
+
+            cs_.append (d.c_str (), dp + 1);
+            cs_.append (tr::directory_separator); // Canonical in work.
+            cs_.append (f.c_str () + fp);
+          }
+        }
+#endif
+      }
+      else
+        unget (c); // No file: put the character back.
+    }
+
+    auto lexer::
+    skip_spaces (bool nl) -> xchar // Note: if nl is false, stop at newline.
+    {
+      xchar c (get ());
+
+      for (; !eos (c); c = get ())
+      {
+        switch (c)
+        {
+        case '\n':
+          if (!nl) break; // Stop at (and return) the newline.
+          // Fall through.
+        case ' ':
+        case '\t':
+        case '\r':
+        case '\f':
+        case '\v':
+          {
+            // Direct buffer scan (of the following spaces and tabs).
+            //
+            const char* b (gptr_);
+            const char* e (egptr_);
+            const char* p (b);
+
+            for (char c;
+                 p != e && ((c = *p) == ' ' || c == '\t');
+                 ++p) ;
+
+            size_t n (p - b);
+            gptr_ = p; buf_->gbump (static_cast<int> (n)); column += n;
+
+            continue; // Get the next character.
+          }
+        case '/':
+          {
+            xchar p (peek ());
+
+            // C++ comment.
+            //
+            if (p == '/')
+            {
+              get (p);
+
+              for (;;)
+              {
+                c = get ();
+                if (c == '\n' || eos (c))
+                  break;
+
+                // Direct buffer scan (up to the next newline or backslash).
+                //
+                const char* b (gptr_);
+                const char* e (egptr_);
+                const char* p (b);
+
+                for (char c;
+                     p != e && (c = *p) != '\n' && c != '\\';
+                     ++p) ;
+
+                size_t n (p - b);
+                gptr_ = p; buf_->gbump (static_cast<int> (n)); column += n;
+              }
+
+              if (!nl) // Stop at the newline (or eos) that ended the comment.
+                break;
+
+              continue;
+            }
+
+            // C comment.
+            //
+            if (p == '*')
+            {
+              get (p);
+
+              for (;;)
+              {
+                c = get ();
+
+                if (eos (c))
+                  fail (p) << "unterminated comment"; // Note: p is the '*'.
+
+                if (c == '*' && (c = peek ()) == '/')
+                {
+                  get (c);
+                  break;
+                }
+
+                // Direct buffer scan (up to the next '*' or backslash).
+                //
+                const char* b (gptr_);
+                const char* e (egptr_);
+                const char* p (b);
+
+                for (char c;
+                     p != e && (c = *p) != '*' && c != '\\';
+                     ++p)
+                {
+                  if (c == '\n') // Keep the line/column counts up to date.
+                  {
+                    if (log_line_) ++*log_line_;
+                    ++line;
+                    column = 1;
+                  }
+                  else
+                    ++column;
+                }
+
+                gptr_ = p; buf_->gbump (static_cast<int> (p - b));
+              }
+              continue;
+            }
+            break; // Just '/' (note: the next character has been peeked at).
+          }
+        }
+        break; // Not a space or comment (or stopping at newline): done.
+      }
+
+      return c;
+    }
+
+    ostream&
+    operator<< (ostream& o, const token& t)
+    {
+      switch (t.type) // Note: only identifiers are printed with their value.
+      {
+      case type::dot:         return o << "'.'";
+      case type::semi:        return o << "';'";
+      case type::less:        return o << "'<'";
+      case type::greater:     return o << "'>'";
+      case type::lcbrace:     return o << "'{'";
+      case type::rcbrace:     return o << "'}'";
+      case type::punctuation: return o << "<punctuation>";
+
+      case type::identifier:  return o << '\'' << t.value << '\'';
+
+      case type::number:      return o << "<number literal>";
+      case type::character:   return o << "<char literal>";
+      case type::string:      return o << "<string literal>";
+
+      case type::other:       return o << "<other>";
+      case type::eos:         return o << "<end of file>";
+      }
+
+      return o; // Not one of the above: print nothing.
+    }
+  }
+}