aboutsummaryrefslogtreecommitdiff
path: root/libbuild2/cc/lexer.cxx
diff options
context:
space:
mode:
Diffstat (limited to 'libbuild2/cc/lexer.cxx')
-rw-r--r--libbuild2/cc/lexer.cxx1129
1 files changed, 1129 insertions, 0 deletions
diff --git a/libbuild2/cc/lexer.cxx b/libbuild2/cc/lexer.cxx
new file mode 100644
index 0000000..6eba57e
--- /dev/null
+++ b/libbuild2/cc/lexer.cxx
@@ -0,0 +1,1129 @@
+// file : libbuild2/cc/lexer.cxx -*- C++ -*-
+// copyright : Copyright (c) 2014-2019 Code Synthesis Ltd
+// license : MIT; see accompanying LICENSE file
+
+#include <libbuild2/cc/lexer.hxx>
+
+using namespace std;
+using namespace butl;
+
+// bit 0 - identifier character (_0-9A-Ba-b).
+//
+static const uint8_t char_flags[256] =
+//0 1 2 3 4 5 6 7 8 9 A B C D E F
+{
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, // 3
+ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, // 5
+ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, // 7
+
+ // 128-255
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0
+};
+
+// Diagnostics plumbing.
+//
+namespace butl // ADL
+{
+ inline build2::location
+ get_location (const butl::char_scanner::xchar& c, const void* data)
+ {
+ using namespace build2;
+
+ assert (data != nullptr); // E.g., must be &lexer::name_.
+ return location (static_cast<const path*> (data), c.line, c.column);
+ }
+}
+
+namespace build2
+{
+ namespace cc
+ {
+ auto lexer::
+ peek (bool e) -> xchar
+ {
+ if (unget_)
+ return ungetc_;
+
+ if (unpeek_)
+ return unpeekc_;
+
+ xchar c (base::peek ());
+
+ if (e && c == '\\')
+ {
+ get (c);
+ xchar p (base::peek ());
+
+ // Handle Windows CRLF sequence. Similar to char_scanner, we treat a
+ // single CR as if it was followed by LF and also collapse multiple
+ // CRs.
+ //
+ while (p == '\r')
+ {
+ get (p);
+ p = base::peek ();
+
+ if (p == '\n')
+ break;
+
+ // Pretend '\n' was there and recurse.
+ //
+ if (p != '\r')
+ return peek (e);
+ }
+
+ if (p == '\n')
+ {
+ get (p);
+ return peek (e); // Recurse.
+ }
+
+ // Save in the unpeek buffer so that it is returned on the subsequent
+ // calls to peek() (until get()).
+ //
+ unpeek_ = true;
+ unpeekc_ = c;
+ }
+
+ return c;
+ }
+
+ inline auto lexer::
+ get (bool e) -> xchar
+ {
+ if (unget_)
+ {
+ unget_ = false;
+ return ungetc_;
+ }
+ else
+ {
+ xchar c (peek (e));
+ get (c);
+ return c;
+ }
+ }
+
+ inline void lexer::
+ get (const xchar& c)
+ {
+ // Increment the logical line similar to how base will increment the
+ // physical (the column counts are the same).
+ //
+ if (log_line_ && c == '\n' && !unget_)
+ ++*log_line_;
+
+ base::get (c);
+ }
+
+ inline auto lexer::
+ geth (bool e) -> xchar
+ {
+ xchar c (get (e));
+ cs_.append (c);
+ return c;
+ }
+
+ inline void lexer::
+ geth (const xchar& c)
+ {
+ get (c);
+ cs_.append (c);
+ }
+
+ using type = token_type;
+
+ void lexer::
+ next (token& t, xchar c, bool ignore_pp)
+ {
+ for (;; c = skip_spaces ())
+ {
+ t.file = log_file_;
+ t.line = log_line_ ? *log_line_ : c.line;
+ t.column = c.column;
+
+ if (eos (c))
+ {
+ t.type = type::eos;
+ return;
+ }
+
+ const location l (&name_, c.line, c.column);
+
+ // Hash the token's line. The reason is debug info. In fact, doing
+ // this will make quite a few "noop" changes (like adding a newline
+ // anywhere in the source) cause the checksum change. But there
+ // doesn't seem to be any way around it: the case where we benefit
+ // from the precise change detection the most (development) is also
+ // where we will most likely have debug info enable.
+ //
+ // Note that in order not to make this completely useless we don't
+ // hash the column. Even if it is part of the debug info, having it a
+ // bit off shouldn't cause any significant mis-positioning. We also
+ // don't hash the file path for each token instead only hashing it
+ // when changed with the #line directive (as well as in the
+ // constructor for the initial path).
+ //
+ cs_.append (t.line);
+ cs_.append (c);
+
+ switch (c)
+ {
+ // Preprocessor lines.
+ //
+ case '#':
+ {
+ // It is tempting to simply scan until the newline ignoring
+ // anything in between. However, these lines can start a
+ // multi-line C-style comment. So we have to tokenize them (and
+ // hash the data for each token).
+ //
+ // Note that this may not work for things like #error that can
+ // contain pretty much anything. Also note that lines that start
+ // with '#' can contain '#' further down. In this case we need to
+ // be careful not to recurse (and consume multiple newlines). Thus
+ // the ignore_pp flag.
+ //
+ // Finally, to support diagnostics properly we need to recognize
+ // #line directives.
+ //
+ if (ignore_pp)
+ {
+ for (bool first (true);;)
+ {
+ // Note that we keep using the passed token for buffers.
+ //
+ c = skip_spaces (false); // Stop at newline.
+
+ if (eos (c) || c == '\n')
+ break;
+
+ if (first)
+ {
+ first = false;
+
+ // Recognize #line and its shorthand version:
+ //
+ // #line <integer> [<string literal>] ...
+ // # <integer> [<string literal>] ...
+ //
+ // Also diagnose #include while at it.
+ //
+ if (!(c >= '0' && c <= '9'))
+ {
+ next (t, c, false);
+
+ if (t.type == type::identifier)
+ {
+ if (t.value == "include")
+ fail (l) << "unexpected #include directive";
+ else if (t.value != "line")
+ continue;
+ }
+ else
+ continue;
+
+ if (t.type != type::identifier || t.value != "line")
+ continue;
+
+ c = skip_spaces (false);
+
+ if (!(c >= '0' && c <= '9'))
+ fail (c) << "line number expected after #line directive";
+ }
+
+ // Ok, this is #line and next comes the line number.
+ //
+ line_directive (t, c);
+ continue; // Parse the tail, if any.
+ }
+
+ next (t, c, false);
+ }
+ break;
+ }
+ else
+ {
+ t.type = type::punctuation;
+ return;
+ }
+ }
+ // Single-letter punctuation.
+ //
+ case ';': t.type = type::semi; return;
+ case '{': t.type = type::lcbrace; return;
+ case '}': t.type = type::rcbrace; return;
+ // Other single-letter punctuation.
+ //
+ case '(':
+ case ')':
+ case '[':
+ case ']':
+ case ',':
+ case '?':
+ case '~':
+ case '\\': t.type = type::punctuation; return;
+ // Potentially multi-letter punctuation.
+ //
+ case '.': // . .* .<N> ...
+ {
+ xchar p (peek ());
+
+ if (p == '*')
+ {
+ geth (p);
+ t.type = type::punctuation;
+ return;
+ }
+ else if (p >= '0' && p <= '9')
+ {
+ number_literal (t, c);
+ return;
+ }
+ else if (p == '.')
+ {
+ get (p);
+
+ xchar q (peek ());
+ if (q == '.')
+ {
+ cs_.append (p);
+
+ geth (q);
+ t.type = type::punctuation;
+ return;
+ }
+ unget (p);
+ // Fall through.
+ }
+
+ t.type = type::dot;
+ return;
+ }
+ case '=': // = ==
+ case '!': // ! !=
+ case '*': // * *=
+ case '/': // / /= (/* and // handled by skip_spaced() above)
+ case '%': // % %=
+ case '^': // ^ ^=
+ {
+ xchar p (peek ());
+
+ if (p == '=')
+ geth (p);
+
+ t.type = type::punctuation;
+ return;
+ }
+ case '<': // < <= << <<=
+ case '>': // > >= >> >>=
+ {
+ xchar p (peek ());
+
+ if (p == c)
+ {
+ geth (p);
+ if ((p = peek ()) == '=')
+ geth (p);
+ t.type = type::punctuation;
+ }
+ else if (p == '=')
+ {
+ geth (p);
+ t.type = type::punctuation;
+ }
+ else
+ t.type = (c == '<' ? type::less : type::greater);
+
+ return;
+ }
+ case '+': // + ++ +=
+ case '-': // - -- -= -> ->*
+ {
+ xchar p (peek ());
+
+ if (p == c || p == '=')
+ geth (p);
+ else if (c == '-' && p == '>')
+ {
+ geth (p);
+ if ((p = peek ()) == '*')
+ geth (p);
+ }
+
+ t.type = type::punctuation;
+ return;
+ }
+ case '&': // & && &=
+ case '|': // | || |=
+ {
+ xchar p (peek ());
+
+ if (p == c || p == '=')
+ geth (p);
+
+ t.type = type::punctuation;
+ return;
+ }
+ case ':': // : ::
+ {
+ xchar p (peek ());
+
+ if (p == ':')
+ geth (p);
+
+ t.type = type::punctuation;
+ return;
+ }
+ // Number (and also .<N> above).
+ //
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ {
+ number_literal (t, c);
+ return;
+ }
+ // Char/string literal, identifier, or other (\, $, @, `).
+ //
+ default:
+ {
+ bool raw (false); // Raw string literal.
+
+ // Note: known not to be a digit (see above).
+ //
+ if (char_flags[static_cast<uint8_t> (c)] & 0x01)
+ {
+ // This smells a little: we know skip_spaces() did not peek at
+ // the next character because this is not '/'. Which means the
+ // position in the stream must be of this character + 1.
+ //
+ t.position = buf_->tellg () - 1;
+
+ string& id (t.value);
+ id = c;
+
+ while (char_flags[static_cast<uint8_t> (c = peek ())] & 0x01)
+ {
+ geth (c);
+ id += c;
+
+ // Direct buffer scan. Note that we always follow up with the
+ // normal peek() call which may load the next chunk, handle
+ // line continuations, etc. In other words, the end of the
+ // "raw" scan doesn't necessarily mean the end.
+ //
+ const char* b (gptr_);
+ const char* p (b);
+
+ for (const char* e (egptr_);
+ p != e && char_flags[static_cast<uint8_t> (*p)] & 0x01;
+ ++p) ;
+
+ // Unrolling this loop doesn't make a difference.
+ //
+ // for (const char* e (egptr_ - 4); p < e; p += 4)
+ // {
+ // uint8_t c;
+ //
+ // c = static_cast<uint8_t> (p[0]);
+ // if (!(char_flags[c] & 0x01)) break;
+ //
+ // c = static_cast<uint8_t> (p[1]);
+ // if (!(char_flags[c] & 0x01)) {p += 1; break;}
+ //
+ // c = static_cast<uint8_t> (p[2]);
+ // if (!(char_flags[c] & 0x01)) {p += 2; break;}
+ //
+ // c = static_cast<uint8_t> (p[3]);
+ // if (!(char_flags[c] & 0x01)) {p += 3; break;}
+ // }
+
+ size_t n (p - b);
+ id.append (b, n); cs_.append (b, n);
+ gptr_ = p; buf_->gbump (static_cast<int> (n)); column += n;
+ }
+
+ // If the following character is a quote, see if the identifier
+ // is one of the literal prefixes.
+ //
+ if (c == '\'' || c == '\"')
+ {
+ size_t n (id.size ()), i (0);
+ switch (id[0])
+ {
+ case 'u':
+ {
+ if (n > 1 && id[1] == '8')
+ ++i;
+ }
+ // Fall through.
+ case 'L':
+ case 'U':
+ {
+ ++i;
+
+ if (c == '\"' && n > i && id[i] == 'R')
+ {
+ ++i;
+ raw = true;
+ }
+ break;
+ }
+ case 'R':
+ {
+ if (c == '\"')
+ {
+ ++i;
+ raw = true;
+ }
+ break;
+ }
+ }
+
+ if (i == n) // All characters "consumed".
+ {
+ geth (c);
+ id.clear ();
+ }
+ }
+
+ if (!id.empty ())
+ {
+ t.type = type::identifier;
+ return;
+ }
+ }
+
+ switch (c)
+ {
+ case '\'':
+ {
+ char_literal (t, c);
+ return;
+ }
+ case '\"':
+ {
+ if (raw)
+ raw_string_literal (t, c);
+ else
+ string_literal (t, c);
+ return;
+ }
+ default:
+ {
+ t.type = type::other;
+ return;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ void lexer::
+ number_literal (token& t, xchar c)
+ {
+ // note: c is hashed
+
+ // A number (integer or floating point literal) can:
+ //
+ // 1. Start with a dot (which must be followed by a digit, e.g., .123).
+ //
+ // 2. Can have a radix prefix (0b101, 0123, 0X12AB).
+ //
+ // 3. Can have an exponent (1e10, 0x1.p-10, 1.).
+ //
+ // 4. Digits can be separated with ' (123'456, 0xff00'00ff).
+ //
+ // 5. End with a built-in or user defined literal (123f, 123UL, 123_X)
+ //
+ // Quoting from GCC's preprocessor documentation:
+ //
+ // "Formally preprocessing numbers begin with an optional period, a
+ // required decimal digit, and then continue with any sequence of
+ // letters, digits, underscores, periods, and exponents. Exponents are
+ // the two-character sequences 'e+', 'e-', 'E+', 'E-', 'p+', 'p-', 'P+',
+ // and 'P-'."
+ //
+ // So it looks like a "C++ number" is then any unseparated (with
+ // whitespace or punctuation) sequence of those plus '. The only mildly
+ // tricky part is then to recognize +/- as being part of the exponent.
+ //
+ while (!eos ((c = peek ())))
+ {
+ switch (c)
+ {
+ // All the whitespace, punctuation, and other characters that end
+ // the number.
+ //
+ case ' ':
+ case '\n':
+ case '\t':
+ case '\r':
+ case '\f':
+ case '\v':
+
+ case '#':
+ case ';':
+ case '{':
+ case '}':
+ case '(':
+ case ')':
+ case '[':
+ case ']':
+ case ',':
+ case '?':
+ case '~':
+ case '=':
+ case '!':
+ case '*':
+ case '/':
+ case '%':
+ case '^':
+ case '>':
+ case '<':
+ case '&':
+ case '|':
+ case ':':
+ case '+': // The exponent case is handled below.
+ case '-': // The exponent case is handled below.
+ case '"':
+ case '\\':
+
+ case '@':
+ case '$':
+ case '`':
+ break;
+
+ // Recognize +/- after the exponent.
+ //
+ case 'e':
+ case 'E':
+ case 'p':
+ case 'P':
+ {
+ geth (c);
+ c = peek ();
+ if (c == '+' || c == '-')
+ geth (c);
+ continue;
+ }
+
+ case '_':
+ case '.':
+ case '\'':
+ default: // Digits and letters.
+ {
+ geth (c);
+ continue;
+ }
+ }
+
+ break;
+ }
+
+ t.type = type::number;
+ }
+
+ void lexer::
+ char_literal (token& t, xchar c)
+ {
+ // note: c is hashed
+
+ const location l (&name_, c.line, c.column);
+
+ for (char p (c);;) // Previous character (see below).
+ {
+ c = geth ();
+
+ if (eos (c) || c == '\n')
+ fail (l) << "unterminated character literal";
+
+ if (c == '\'' && p != '\\')
+ break;
+
+ // Keep track of \\-escapings so we don't confuse them with \', as in
+ // '\\'.
+ //
+ p = (c == '\\' && p == '\\') ? '\0' : static_cast<char> (c);
+ }
+
+ // See if we have a user-defined suffix (which is an identifier).
+ //
+ if ((c = peek ()) == '_' || alpha (c))
+ literal_suffix (c);
+
+ t.type = type::character;
+ }
+
+ void lexer::
+ string_literal (token& t, xchar c)
+ {
+ // note: c is hashed
+
+ const location l (&name_, c.line, c.column);
+
+ for (char p (c);;) // Previous character (see below).
+ {
+ c = geth ();
+
+ if (eos (c) || c == '\n')
+ fail (l) << "unterminated string literal";
+
+ if (c == '\"' && p != '\\')
+ break;
+
+ // Keep track of \\-escapings so we don't confuse them with \", as in
+ // "\\".
+ //
+ p = (c == '\\' && p == '\\') ? '\0' : static_cast<char> (c);
+
+ // Direct buffer scan.
+ //
+ if (p != '\\')
+ {
+ const char* b (gptr_);
+ const char* e (egptr_);
+ const char* p (b);
+
+ for (char c;
+ p != e && (c = *p) != '\"' && c != '\\' && c != '\n';
+ ++p) ;
+
+ size_t n (p - b);
+ cs_.append (b, n);
+ gptr_ = p; buf_->gbump (static_cast<int> (n)); column += n;
+ }
+ }
+
+ // See if we have a user-defined suffix (which is an identifier).
+ //
+ if ((c = peek ()) == '_' || alpha (c))
+ literal_suffix (c);
+
+ t.type = type::string;
+ }
+
+ void lexer::
+ raw_string_literal (token& t, xchar c)
+ {
+ // note: c is hashed
+
+ // The overall form is:
+ //
+ // R"<delimiter>(<raw_characters>)<delimiter>"
+ //
+ // Where <delimiter> is a potentially-empty character sequence made of
+ // any source character but parentheses, backslash and spaces. It can be
+ // at most 16 characters long.
+ //
+ // Note that the <raw_characters> are not processed in any way, not even
+ // for line continuations.
+ //
+ const location l (&name_, c.line, c.column);
+
+ // As a first step, parse the delimiter (including the openning paren).
+ //
+ string d (1, ')');
+
+ for (;;)
+ {
+ c = geth ();
+
+ if (eos (c) || c == '\"' || c == ')' || c == '\\' || c == ' ')
+ fail (l) << "invalid raw string literal";
+
+ if (c == '(')
+ break;
+
+ d += c;
+ }
+
+ d += '"';
+
+ // Now parse the raw characters while trying to match the closing
+ // delimiter.
+ //
+ for (size_t i (0);;) // Position to match in d.
+ {
+ c = geth (false); // No newline escaping.
+
+ if (eos (c)) // Note: newline is ok.
+ fail (l) << "invalid raw string literal";
+
+ if (c != d[i] && i != 0) // Restart from the beginning.
+ i = 0;
+
+ if (c == d[i])
+ {
+ if (++i == d.size ())
+ break;
+ }
+ }
+
+ // See if we have a user-defined suffix (which is an identifier).
+ //
+ if ((c = peek ()) == '_' || alpha (c))
+ literal_suffix (c);
+
+ t.type = type::string;
+ }
+
+ void lexer::
+ literal_suffix (xchar c)
+ {
+ // note: c is unhashed
+
+ // Parse a user-defined literal suffix identifier.
+ //
+ for (geth (c); (c = peek ()) == '_' || alnum (c); geth (c)) ;
+ }
+
+ void lexer::
+ line_directive (token& t, xchar c)
+ {
+ // enter: first digit of the line number
+ // leave: last character of the line number or file string
+ // note: c is unhashed
+
+ // If our number and string tokens contained the literal values, then we
+ // could have used that. However, we ignore the value (along with escape
+ // processing, etc), for performance. Let's keep it that way and instead
+ // handle it ourselves.
+ //
+ // Note also that we are not hashing these at the character level
+ // instead hashing the switch to a new file path below and leaving the
+ // line number to the token line hashing.
+ //
+ {
+ string& s (t.value);
+
+ for (s = c; (c = peek ()) >= '0' && c <= '9'; get (c))
+ s += c;
+
+ // The newline that ends the directive will increment the logical line
+ // so subtract one to compensate. Note: can't be 0 and shouldn't throw
+ // for valid lines.
+ //
+ log_line_ = stoull (s.c_str ()) - 1;
+ }
+
+ // See if we have the file.
+ //
+ c = skip_spaces (false);
+
+ if (c == '\"')
+ {
+ const location l (&name_, c.line, c.column);
+
+ // It is common to have a large number of #line directives that don't
+ // change the file (they seem to be used to track macro locations or
+ // some such). So we are going to optimize for this by comparing the
+ // current path to what's in #line.
+ //
+ string& s (tmp_file_);
+ s.clear ();
+
+ for (char p ('\0'); p != '\"'; ) // Previous character.
+ {
+ c = get ();
+
+ if (eos (c) || c == '\n')
+ fail (l) << "unterminated string literal";
+
+ // Handle escapes.
+ //
+ if (p == '\\')
+ {
+ p = '\0'; // Clear so we don't confuse \" and \\".
+
+ // We only handle what can reasonably be expected in a file name.
+ //
+ switch (c)
+ {
+ case '\\':
+ case '\'':
+ case '\"': break; // Add as is.
+ default:
+ fail (c) << "unsupported escape sequence in #line directive";
+ }
+ }
+ else
+ {
+ p = c;
+
+ switch (c)
+ {
+ case '\\':
+ case '\"': continue;
+ }
+ }
+
+ s += c;
+
+ // Direct buffer scan.
+ //
+ if (p != '\\')
+ {
+ const char* b (gptr_);
+ const char* e (egptr_);
+ const char* p (b);
+
+ for (char c;
+ p != e && (c = *p) != '\"' && c != '\\' && c != '\n';
+ ++p) ;
+
+ size_t n (p - b);
+ s.append (b, n);
+ gptr_ = p; buf_->gbump (static_cast<int> (n)); column += n;
+ }
+ }
+
+ if (log_file_.string () == s)
+ return;
+
+ // Swap the two string buffers.
+ //
+ {
+ string r (move (log_file_).string ()); // Move string rep out.
+ r.swap (s);
+ log_file_ = path (move (r)); // Move back in.
+ }
+
+ // If the path is relative, then prefix it with the current working
+ // directory. Failed that, we will end up with different checksums for
+ // invocations from different directories.
+ //
+ // While this should work fine for normal cross-compilation, it's an
+ // entirely different story for the emulated case (e.g., msvc-linux
+ // where the preprocessed output contains absolute Windows paths). So
+ // we try to sense if things look fishy and leave the path alone.
+ //
+ // Also detect special names like <built-in> and <command-line>. Plus
+ // GCC sometimes adds what looks like working directory (has trailing
+ // slash). So ignore that as well.
+ //
+ // We now switched to using absolute translation unit paths (because
+ // of __FILE__/assert(); see compile.cxx for details). But we might
+ // still need this logic when we try to calculate location-independent
+ // hash for distributed compilation/caching. The idea is to only hash
+ // the part starting from the project root which is immutable. Plus
+ // we will need -ffile-prefix-map to deal with __FILE__.
+ //
+ if (!log_file_.to_directory ())
+ cs_.append (log_file_.string ());
+#if 0
+ {
+ using tr = path::traits;
+ const string& f (log_file_.string ());
+
+ if (f.find (':') != string::npos ||
+ (f.front () == '<' && f.back () == '>') ||
+ log_file_.absolute ())
+ cs_.append (f);
+ else
+ {
+ // This gets complicated and slow: the path may contain '..' and
+ // '.' so strictly speaking we would need to normalize it.
+ // Instead, we are going to handle leading '..'s ourselves (the
+ // sane case) and ignore everything else (so if you have '..' or
+ // '.' somewhere in the middle, then things might not work
+ // optimally for you).
+ //
+ const string& d (work.string ());
+
+ // Iterate over leading '..' in f "popping" the corresponding
+ // number of trailing components from d.
+ //
+ size_t fp (0);
+ size_t dp (d.size () - 1);
+
+ for (size_t p;; )
+ {
+ // Note that in file we recognize any directory separator, not
+ // just of this platform (see note about emulation above).
+ //
+ if (f.compare (fp, 2, "..") != 0 ||
+ (f[fp + 2] != '/' && f[fp + 2] != '\\') || // Could be '\0'.
+ (p = tr::rfind_separator (d, dp)) == string::npos)
+ break;
+
+ fp += 3;
+ dp = p - 1;
+ }
+
+ cs_.append (d.c_str (), dp + 1);
+ cs_.append (tr::directory_separator); // Canonical in work.
+ cs_.append (f.c_str () + fp);
+ }
+ }
+#endif
+ }
+ else
+ unget (c);
+ }
+
+ auto lexer::
+ skip_spaces (bool nl) -> xchar
+ {
+ xchar c (get ());
+
+ for (; !eos (c); c = get ())
+ {
+ switch (c)
+ {
+ case '\n':
+ if (!nl) break;
+ // Fall through.
+ case ' ':
+ case '\t':
+ case '\r':
+ case '\f':
+ case '\v':
+ {
+ // Direct buffer scan.
+ //
+ const char* b (gptr_);
+ const char* e (egptr_);
+ const char* p (b);
+
+ for (char c;
+ p != e && ((c = *p) == ' ' || c == '\t');
+ ++p) ;
+
+ size_t n (p - b);
+ gptr_ = p; buf_->gbump (static_cast<int> (n)); column += n;
+
+ continue;
+ }
+ case '/':
+ {
+ xchar p (peek ());
+
+ // C++ comment.
+ //
+ if (p == '/')
+ {
+ get (p);
+
+ for (;;)
+ {
+ c = get ();
+ if (c == '\n' || eos (c))
+ break;
+
+ // Direct buffer scan.
+ //
+ const char* b (gptr_);
+ const char* e (egptr_);
+ const char* p (b);
+
+ for (char c;
+ p != e && (c = *p) != '\n' && c != '\\';
+ ++p) ;
+
+ size_t n (p - b);
+ gptr_ = p; buf_->gbump (static_cast<int> (n)); column += n;
+ }
+
+ if (!nl)
+ break;
+
+ continue;
+ }
+
+ // C comment.
+ //
+ if (p == '*')
+ {
+ get (p);
+
+ for (;;)
+ {
+ c = get ();
+
+ if (eos (c))
+ fail (p) << "unterminated comment";
+
+ if (c == '*' && (c = peek ()) == '/')
+ {
+ get (c);
+ break;
+ }
+
+ // Direct buffer scan.
+ //
+ const char* b (gptr_);
+ const char* e (egptr_);
+ const char* p (b);
+
+ for (char c;
+ p != e && (c = *p) != '*' && c != '\\';
+ ++p)
+ {
+ if (c == '\n')
+ {
+ if (log_line_) ++*log_line_;
+ ++line;
+ column = 1;
+ }
+ else
+ ++column;
+ }
+
+ gptr_ = p; buf_->gbump (static_cast<int> (p - b));
+ }
+ continue;
+ }
+ break;
+ }
+ }
+ break;
+ }
+
+ return c;
+ }
+
+ ostream&
+ operator<< (ostream& o, const token& t)
+ {
+ switch (t.type)
+ {
+ case type::dot: o << "'.'"; break;
+ case type::semi: o << "';'"; break;
+ case type::less: o << "'<'"; break;
+ case type::greater: o << "'>'"; break;
+ case type::lcbrace: o << "'{'"; break;
+ case type::rcbrace: o << "'}'"; break;
+ case type::punctuation: o << "<punctuation>"; break;
+
+ case type::identifier: o << '\'' << t.value << '\''; break;
+
+ case type::number: o << "<number literal>"; break;
+ case type::character: o << "<char literal>"; break;
+ case type::string: o << "<string literal>"; break;
+
+ case type::other: o << "<other>"; break;
+ case type::eos: o << "<end of file>"; break;
+ }
+
+ return o;
+ }
+ }
+}