aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBoris Kolpackov <boris@codesynthesis.com>2017-05-24 13:24:31 +0200
committerBoris Kolpackov <boris@codesynthesis.com>2017-05-24 13:24:31 +0200
commit0cef93b4e2e9bf39b0ca542876f9ab1af6d0f01d (patch)
tree187b83b65f28cdf4f8a2b0feadf392b49554fbf3
parentb3526a5c925169b3be00a5dd4d8c8222f3a475cd (diff)
Implement support for tokenization of preprocessed C/C++ source
-rw-r--r--build2/buildfile1
-rw-r--r--build2/cc/lexer.cxx683
-rw-r--r--build2/cc/lexer.hxx166
-rw-r--r--unit-tests/cc/lexer/buildfile17
-rw-r--r--unit-tests/cc/lexer/char-literal.test67
-rw-r--r--unit-tests/cc/lexer/comment.test88
-rw-r--r--unit-tests/cc/lexer/driver.cxx66
-rw-r--r--unit-tests/cc/lexer/line.test67
-rw-r--r--unit-tests/cc/lexer/number.test48
-rw-r--r--unit-tests/cc/lexer/preprocessor.test38
-rw-r--r--unit-tests/cc/lexer/raw-string-literal.test90
-rw-r--r--unit-tests/cc/lexer/string-literal.test65
12 files changed, 1396 insertions, 0 deletions
diff --git a/build2/buildfile b/build2/buildfile
index 2d65001..69dfc94 100644
--- a/build2/buildfile
+++ b/build2/buildfile
@@ -54,6 +54,7 @@ exe{b}: \
cc/{hxx cxx}{ guess } \
cc/{hxx cxx}{ init } \
cc/{hxx cxx}{ install } \
+ cc/{hxx cxx}{ lexer } \
cc/{hxx cxx}{ link } \
cc/{hxx cxx}{ module } \
cc/{ cxx}{ msvc } \
diff --git a/build2/cc/lexer.cxx b/build2/cc/lexer.cxx
new file mode 100644
index 0000000..3eb5d5b
--- /dev/null
+++ b/build2/cc/lexer.cxx
@@ -0,0 +1,683 @@
+// file : build2/cc/lexer.cxx -*- C++ -*-
+// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+// license : MIT; see accompanying LICENSE file
+
+#include <build2/cc/lexer.hxx>
+
+using namespace std;
+using namespace butl;
+
+// Diagnostics plumbing.
+//
+namespace butl // ADL
+{
+  // Convert the position of a scanned character into a diagnostics location.
+  // The data pointer must not be null and is interpreted as a pointer to the
+  // path that names the source being scanned.
+  //
+  inline build2::location
+  get_location (const butl::char_scanner::xchar& c, const void* data)
+  {
+    assert (data != nullptr); // E.g., must be &lexer::name_.
+
+    const path* name (static_cast<const path*> (data));
+    return build2::location (name, c.line, c.column);
+  }
+}
+
+namespace build2
+{
+ namespace cc
+ {
+    // Extract and return the next character. If e is true, then the escape
+    // (backslash) processing performed by peek() is applied.
+    //
+    inline auto lexer::
+    get (bool e) -> xchar
+    {
+      // Normal case: nothing pending in the unget buffer, so peek at the next
+      // character and then consume it.
+      //
+      if (!unget_)
+      {
+        xchar r (peek (e));
+        base::get (r);
+        return r;
+      }
+
+      // Otherwise hand out the pending character and clear the flag.
+      //
+      unget_ = false;
+      return ungetc_;
+    }
+
+    // Return the next character without consuming it. If e is true, then a
+    // backslash immediately followed by a newline (line continuation) is
+    // skipped and the character after it is returned instead.
+    //
+    auto lexer::
+    peek (bool e) -> xchar
+    {
+      // Pending unget/unpeek characters take precedence over the stream.
+      //
+      if (unget_)
+        return ungetc_;
+
+      if (unpeek_)
+        return unpeekc_;
+
+      xchar c (base::peek ());
+
+      // Anything other than a backslash (or any character at all if escape
+      // processing is disabled) is returned as is.
+      //
+      if (!e || c != '\\')
+        return c;
+
+      // We have to extract the backslash in order to see what follows it.
+      //
+      base::get (c);
+      xchar n (base::peek ());
+
+      if (n == '\n')
+      {
+        base::get (n);
+        return peek (e); // Recurse.
+      }
+
+      // Not a line continuation. Since the backslash has already been
+      // extracted, save it in the unpeek buffer so that it is returned on the
+      // subsequent calls to peek() (until get()).
+      //
+      unpeek_ = true;
+      unpeekc_ = c;
+
+      return c;
+    }
+
+ using type = token_type;
+
+ void lexer:: // Scan one token into t (type, position, and, for identifiers, value).
+ next (token& t, xchar c) // c is the token's first character, already extracted.
+ {
+ for (;; c = skip_spaces ()) // Only iterates again after a skipped preprocessor line.
+ {
+ t.line = c.line; // The token position is that of its first character.
+ t.column = c.column;
+
+ if (eos (c))
+ {
+ t.type = type::eos;
+ return;
+ }
+
+ switch (c)
+ {
+ // Preprocessor lines.
+ //
+ case '#':
+ {
+ // It is tempting to simply scan until the newline ignoring
+ // anything in between. However, these lines can start a
+ // multi-line C-style comment. So we have to tokenize it. Note
+ // that we assume there cannot be #include directives.
+ //
+ // This may not work for things like #error that can contain
+ // pretty much anything.
+ //
+ for (;;)
+ {
+ c = skip_spaces (false); // Stop at newline.
+
+ if (eos (c) || c == '\n')
+ break;
+
+ next (t, c); // Keep using the passed token for buffers.
+ }
+ break; // The directive's tokens are discarded; go scan the next line.
+ }
+ // Single-letter punctuation.
+ //
+ case ';': t.type = type::semi; return;
+ case '{': t.type = type::lcbrace; return;
+ case '}': t.type = type::rcbrace; return;
+ // Other single-letter punctuation.
+ //
+ case '(':
+ case ')':
+ case '[':
+ case ']':
+ case ',':
+ case '?':
+ case '~':
+ case '\\': t.type = type::punctuation; return;
+ // Potentially multi-letter punctuation.
+ //
+ case '.': // . .* .<N> ...
+ {
+ xchar p (peek ());
+
+ if (p == '*')
+ {
+ get (p);
+ t.type = type::punctuation;
+ return;
+ }
+ else if (p >= '0' && p <= '9')
+ {
+ number_literal (t, c);
+ return;
+ }
+ else if (p == '.')
+ {
+ get (p);
+ xchar q (peek ());
+ if (q == '.')
+ {
+ get (q);
+ t.type = type::punctuation;
+ return;
+ }
+ unget (p); // Only two dots: put the second one back.
+ // Fall through.
+ }
+
+ t.type = type::dot;
+ return;
+ }
+ case '=': // = ==
+ case '!': // ! !=
+ case '*': // * *=
+ case '/': // / /= (/* and // handled by skip_spaces() above)
+ case '%': // % %=
+ case '^': // ^ ^=
+ {
+ xchar p (peek ());
+
+ if (p == '=')
+ get (p);
+
+ t.type = type::punctuation;
+ return;
+ }
+ case '>': // > >= >> >>=
+ case '<': // < <= << <<=
+ {
+ xchar p (peek ());
+
+ if (p == c)
+ {
+ get (p);
+ if ((p = peek ()) == '=')
+ get (p);
+ }
+ else if (p == '=')
+ get (p);
+
+ t.type = type::punctuation;
+ return;
+ }
+ case '+': // + ++ +=
+ case '-': // - -- -= -> ->*
+ {
+ xchar p (peek ());
+
+ if (p == c)
+ get (p);
+ else if (p == '=')
+ get (p);
+ else if (c == '-' && p == '>')
+ {
+ get (p);
+ if ((p = peek ()) == '*')
+ get (p);
+ }
+
+ t.type = type::punctuation;
+ return;
+ }
+ case '&': // & && &=
+ case '|': // | || |=
+ {
+ xchar p (peek ());
+
+ if (p == c)
+ get (p);
+ else if (p == '=')
+ get (p);
+
+ t.type = type::punctuation;
+ return;
+ }
+ case ':': // : ::
+ {
+ xchar p (peek ());
+
+ if (p == ':')
+ get (p);
+
+ t.type = type::punctuation;
+ return;
+ }
+ // Number (and also .<N> above).
+ //
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ {
+ number_literal (t, c);
+ return;
+ }
+ // Char/string literal, identifier, or other (\, $, @, `).
+ //
+ default:
+ {
+ bool raw (false); // Raw string literal.
+
+ if (alpha (c) || c == '_')
+ {
+ string& id (t.value);
+ id.clear ();
+
+ for (id += c; (c = peek ()) == '_' || alnum (c); get (c))
+ id += c;
+
+ // If the following character is a quote, see if the identifier
+ // is one of the literal prefixes.
+ //
+ if (c == '\'' || c == '\"')
+ {
+ size_t n (id.size ()), i (0); // i: number of leading id characters that form a literal prefix.
+ switch (id[0])
+ {
+ case 'u':
+ {
+ if (n > 1 && id[1] == '8')
+ ++i;
+ // Fall through.
+ }
+ case 'L':
+ case 'U':
+ {
+ ++i;
+
+ if (c == '\"' && n > i && id[i] == 'R')
+ {
+ ++i;
+ raw = true;
+ }
+ break;
+ }
+ case 'R':
+ {
+ if (c == '\"')
+ {
+ ++i;
+ raw = true;
+ }
+ break;
+ }
+ }
+
+ if (i == n) // All characters "consumed".
+ {
+ get (c); // Extract the opening quote; the literal is scanned below.
+ id.clear ();
+ }
+ }
+
+ if (!id.empty ())
+ {
+ t.type = type::identifier;
+ return;
+ }
+ }
+
+ switch (c)
+ {
+ case '\'':
+ {
+ char_literal (t, c);
+ return;
+ }
+ case '\"':
+ {
+ if (raw)
+ raw_string_literal (t, c);
+ else
+ string_literal (t, c);
+ return;
+ }
+ default:
+ {
+ t.type = type::other;
+ return;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ void lexer:: // Consume the rest of a number literal and set t to type::number.
+ number_literal (token& t, xchar c) // c is the first character (a digit or a dot), already extracted.
+ {
+ t.line = c.line;
+ t.column = c.column;
+
+ // A number (integer or floating point literal) can:
+ //
+ // 1. Start with a dot (which must be followed by a digit, e.g., .123).
+ //
+ // 2. Can have a radix prefix (0b101, 0123, 0X12AB).
+ //
+ // 3. Can have an exponent (1e10, 0x1.p-10, 1.).
+ //
+ // 4. Digits can be separated with ' (123'456, 0xff00'00ff).
+ //
+ // 5. End with a built-in or user defined literal (123f, 123UL, 123_X)
+ //
+ // Quoting from GCC's preprocessor documentation:
+ //
+ // "Formally preprocessing numbers begin with an optional period, a
+ // required decimal digit, and then continue with any sequence of
+ // letters, digits, underscores, periods, and exponents. Exponents are
+ // the two-character sequences 'e+', 'e-', 'E+', 'E-', 'p+', 'p-', 'P+',
+ // and 'P-'."
+ //
+ // So it looks like a "C++ number" is then any unseparated (with
+ // whitespace or punctuation) sequence of those plus '. The only mildly
+ // tricky part is then to recognize +/- as being part of the exponent.
+ //
+ while (!eos ((c = peek ()))) // Characters are only peeked here; each case decides whether to extract.
+ {
+ switch (c)
+ {
+ // All the whitespace, punctuation, and other characters that end
+ // the number.
+ //
+ case ' ':
+ case '\n':
+ case '\t':
+ case '\r':
+ case '\f':
+ case '\v':
+
+ case '#':
+ case ';':
+ case '{':
+ case '}':
+ case '(':
+ case ')':
+ case '[':
+ case ']':
+ case ',':
+ case '?':
+ case '~':
+ case '=':
+ case '!':
+ case '*':
+ case '/':
+ case '%':
+ case '^':
+ case '>':
+ case '<':
+ case '&':
+ case '|':
+ case ':':
+ case '+': // The exponent case is handled below.
+ case '-': // The exponent case is handled below.
+ case '"':
+ case '\\':
+
+ case '@':
+ case '$':
+ case '`':
+ break; // Terminator: leave it unextracted for the next token.
+
+ // Recognize +/- after the exponent.
+ //
+ case 'e':
+ case 'E':
+ case 'p':
+ case 'P':
+ {
+ get (c);
+ c = peek ();
+ if (c == '+' || c == '-')
+ get (c);
+ continue;
+ }
+
+ case '_':
+ case '.':
+ case '\'':
+ default: // Digits and letters.
+ {
+ get (c);
+ continue;
+ }
+ }
+
+ break; // Reached only from the terminator cases above: end of the number.
+ }
+
+ t.type = type::number;
+ }
+
+    // Scan a character literal through its closing quote and optional
+    // user-defined suffix. The passed character is the opening single quote
+    // (already extracted); its position becomes the token's position. Fail
+    // with "unterminated literal" if the end of stream is reached first.
+    //
+    void lexer::
+    char_literal (token& t, xchar c)
+    {
+      t.line = c.line;
+      t.column = c.column;
+
+      // Previously seen character, reset to NUL after an escaped backslash
+      // so that in '\\' the closing quote is not mistaken for \'.
+      //
+      char prev (c);
+
+      for (;;)
+      {
+        c = get ();
+
+        if (eos (c))
+          fail (location (&name_, t.line, t.column)) << "unterminated literal";
+
+        const bool escaped (prev == '\\');
+
+        if (c == '\'' && !escaped)
+          break;
+
+        if (c == '\\' && escaped)
+          prev = '\0';
+        else
+          prev = static_cast<char> (c);
+      }
+
+      // A user-defined suffix is an identifier that immediately follows.
+      //
+      c = peek ();
+
+      if (c == '_' || alpha (c))
+        literal_suffix (c);
+
+      t.type = type::character;
+    }
+
+    // Scan a (non-raw) string literal through its closing quote and optional
+    // user-defined suffix. The passed character is the opening double quote
+    // (already extracted); its position becomes the token's position. Fail
+    // with "unterminated literal" if the end of stream is reached first.
+    //
+    void lexer::
+    string_literal (token& t, xchar c)
+    {
+      t.line = c.line;
+      t.column = c.column;
+
+      // Previously seen character, reset to NUL after an escaped backslash
+      // so that in "\\" the closing quote is not mistaken for \".
+      //
+      char prev (c);
+
+      for (;;)
+      {
+        c = get ();
+
+        if (eos (c))
+          fail (location (&name_, t.line, t.column)) << "unterminated literal";
+
+        const bool escaped (prev == '\\');
+
+        if (c == '\"' && !escaped)
+          break;
+
+        if (c == '\\' && escaped)
+          prev = '\0';
+        else
+          prev = static_cast<char> (c);
+      }
+
+      // A user-defined suffix is an identifier that immediately follows.
+      //
+      c = peek ();
+
+      if (c == '_' || alpha (c))
+        literal_suffix (c);
+
+      t.type = type::string;
+    }
+
+    // Scan a raw string literal through its closing delimiter and optional
+    // user-defined suffix. The passed character is the opening double quote
+    // (already extracted, as are any encoding prefix and the R); its position
+    // becomes the token's position. Fail with "invalid raw literal" if the
+    // delimiter is malformed or the end of stream is reached before the
+    // closing delimiter.
+    //
+    void lexer::
+    raw_string_literal (token& t, xchar c)
+    {
+      t.line = c.line;
+      t.column = c.column;
+
+      // The overall form is:
+      //
+      // R"<delimiter>(<raw_characters>)<delimiter>"
+      //
+      // Where <delimiter> is a potentially-empty character sequence made of
+      // any source character but parentheses, backslash and whitespace
+      // (space, horizontal/vertical tab, form feed, and newline). It can be
+      // at most 16 characters long.
+      //
+      // Note that the <raw_characters> are not processed in any way, not even
+      // for line continuations.
+      //
+      const size_t max_delimiter (16);
+
+      // As a first step, parse the delimiter (including the opening paren).
+      // We accumulate it directly in the form of the closing sequence that we
+      // will be looking for: )<delimiter>"
+      //
+      string d (1, ')');
+
+      for (;;)
+      {
+        c = get ();
+
+        if (c == '(')
+          break;
+
+        // Note that d includes the leading ')' so its size is one greater
+        // than the number of delimiter characters seen so far. Enforcing the
+        // length limit also keeps a malformed literal from accumulating an
+        // arbitrarily long "delimiter".
+        //
+        if (eos (c)   ||
+            c == '\"' || c == ')'  || c == '\\' ||
+            c == ' '  || c == '\t' || c == '\n' || c == '\v' || c == '\f' ||
+            d.size () > max_delimiter)
+          fail (location (&name_, t.line, t.column)) << "invalid raw literal";
+
+        d += c;
+      }
+
+      d += '"';
+
+      // Now parse the raw characters while trying to match the closing
+      // delimiter.
+      //
+      for (size_t i (0);;) // Position to match in d.
+      {
+        c = get (false); // No newline escaping.
+
+        if (eos (c))
+          fail (location (&name_, t.line, t.column)) << "invalid raw literal";
+
+        // Since ')' can only appear as the first character of d, on mismatch
+        // it is sufficient to restart from the beginning and retry the
+        // current character against d[0].
+        //
+        if (c != d[i] && i != 0) // Restart from the beginning.
+          i = 0;
+
+        if (c == d[i])
+        {
+          if (++i == d.size ())
+            break;
+        }
+      }
+
+      // See if we have a user-defined suffix (which is an identifier).
+      //
+      if ((c = peek ()) == '_' || alpha (c))
+        literal_suffix (c);
+
+      t.type = type::string;
+    }
+
+    // Consume a user-defined literal suffix identifier. The passed character
+    // is the suffix's first character which has been peeked at but not yet
+    // extracted.
+    //
+    void lexer::
+    literal_suffix (xchar c)
+    {
+      get (c); // The first (peeked) character.
+
+      // The rest of the identifier: underscores, letters, and digits.
+      //
+      while ((c = peek ()) == '_' || alnum (c))
+        get (c);
+    }
+
+ auto lexer:: // Skip whitespace and comments; return the first other character (extracted) or eos.
+ skip_spaces (bool nl) -> xchar // If nl is false, a newline is not skipped but returned.
+ {
+ xchar c (get ());
+
+ for (; !eos (c); c = get ())
+ {
+ switch (c)
+ {
+ case '\n':
+ {
+ if (!nl)
+ break; // The newline is significant to the caller: return it.
+
+ // Fall through.
+ }
+ case ' ':
+ case '\t':
+ case '\r':
+ case '\f':
+ case '\v': continue;
+
+ case '/':
+ {
+ xchar p (peek ());
+
+ // C++ comment.
+ //
+ if (p == '/')
+ {
+ get (p);
+ do { c = get (); } while (!eos (c) && c != '\n'); // Extract through the end of the line.
+
+ if (!nl)
+ break; // Here c is the terminating newline (or eos): return it.
+
+ continue;
+ }
+
+ // C comment.
+ //
+ if (p == '*')
+ {
+ get (p);
+
+ for (;;)
+ {
+ c = get ();
+
+ if (eos (c))
+ fail (p) << "unterminated comment"; // Diagnosed at the position of the opening '*'.
+
+ if (c == '*' && (c = peek ()) == '/')
+ {
+ get (c);
+ break;
+ }
+ }
+ continue;
+ }
+ break; // A lone '/': return it as the start of a token.
+ }
+ }
+ break; // Any break out of the switch above ends the scan.
+ }
+
+ return c;
+ }
+
+    // Print the token in a form suitable for diagnostics: an identifier as
+    // its quoted value, the individually-recognized punctuation as the quoted
+    // character, and everything else as a <description>.
+    //
+    ostream&
+    operator<< (ostream& o, const token& t)
+    {
+      const char* s (nullptr); // Fixed representation, if any.
+
+      switch (t.type)
+      {
+      case type::identifier: return o << '\'' << t.value << '\'';
+
+      case type::dot:         s = "'.'";              break;
+      case type::semi:        s = "';'";              break;
+      case type::lcbrace:     s = "'{'";              break;
+      case type::rcbrace:     s = "'}'";              break;
+      case type::punctuation: s = "<punctuation>";    break;
+
+      case type::number:      s = "<number literal>"; break;
+      case type::character:   s = "<char literal>";   break;
+      case type::string:      s = "<string literal>"; break;
+
+      case type::other:       s = "<other>";          break;
+      case type::eos:         s = "<end of file>";    break;
+      }
+
+      if (s != nullptr)
+        o << s;
+
+      return o;
+    }
+ }
+}
diff --git a/build2/cc/lexer.hxx b/build2/cc/lexer.hxx
new file mode 100644
index 0000000..0735b45
--- /dev/null
+++ b/build2/cc/lexer.hxx
@@ -0,0 +1,166 @@
+// file : build2/cc/lexer.hxx -*- C++ -*-
+// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+// license : MIT; see accompanying LICENSE file
+
+#ifndef BUILD2_CC_LEXER_HXX
+#define BUILD2_CC_LEXER_HXX
+
+#include <libbutl/char-scanner.hxx>
+
+#include <build2/types.hxx>
+#include <build2/utility.hxx>
+
+#include <build2/diagnostics.hxx>
+
+namespace build2
+{
+ namespace cc
+ {
+ // Preprocessor-level tokenization of C/C++ source. In other words, the
+ // sequence of tokens returned is similar to what a real C/C++ compiler
+ // would see from its preprocessor.
+ //
+ // The input is a (partially-)preprocessed translation unit that may still
+ // contain comments, line continuations, and preprocessor directives such
+ // as #line, #pragma, etc. Currently all preprocessor directives are
+ // discarded and no values are saved for literals.
+ //
+ enum class token_type
+ {
+ // NOTE: remember to update operator<<() if changing anything here!
+ //
+ eos, // End of stream (also the type of a default-constructed token).
+
+ dot, // .
+ semi, // ;
+ lcbrace, // {
+ rcbrace, // }
+ punctuation, // Other punctuation.
+
+ identifier, // Identifier (its spelling is stored in token::value).
+
+ number, // Number literal.
+ character, // Char literal.
+ string, // String literal.
+
+ other // Other token.
+ };
+
+ struct token
+ {
+ token_type type;
+ string value; // Only set for identifiers (their spelling); not cleared for other token types.
+
+ uint64_t line; // Position of the token's first character (for a prefixed
+ uint64_t column; // literal, of its opening quote rather than of the prefix).
+
+ public:
+ token () // An eos token at position 0:0.
+ : token (token_type::eos, 0, 0) {}
+
+ token (token_type t, uint64_t l, uint64_t c) // Token with an empty value.
+ : token (t, string (), l, c) {}
+
+ token (token_type t, string v, uint64_t l, uint64_t c)
+ : type (t), value (move (v)), line (l), column (c) {}
+ };
+
+ // Output the token value in a format suitable for diagnostics.
+ //
+ ostream&
+ operator<< (ostream&, const token&);
+
+ class lexer: protected butl::char_scanner
+ {
+ public:
+ lexer (istream& is, const path& name) // The name is copied and used in diagnostics locations.
+ : char_scanner (is, false), name_ (name), fail ("error", &name_) {}
+
+ const path&
+ name () const {return name_;}
+
+ // Note that it is ok to call next() again after getting eos.
+ //
+ token
+ next ()
+ {
+ token t;
+ next (t, skip_spaces ());
+ return t;
+ }
+
+ // As above but reuse the token to avoid a (potential) memory
+ // allocation. Typical usage:
+ //
+ // for (token t; l.next (t) != token_type::eos; )
+ // ...
+ //
+ token_type
+ next (token& t)
+ {
+ next (t, skip_spaces ());
+ return t.type;
+ }
+
+ private:
+ void
+ next (token&, xchar); // Scan the token that starts with the passed (extracted) character.
+
+ void
+ number_literal (token&, xchar); // Passed the literal's first character (extracted).
+
+ void
+ char_literal (token&, xchar); // Passed the opening quote (extracted).
+
+ void
+ string_literal (token&, xchar); // Passed the opening quote (extracted).
+
+ void
+ raw_string_literal (token&, xchar); // Passed the opening quote (extracted).
+
+ void
+ literal_suffix (xchar); // Passed the suffix's first character (peeked, not extracted).
+
+ xchar
+ skip_spaces (bool newline = true); // If newline is false, stop at (and return) a newline.
+
+ // The char_scanner adaptation for newline escape sequence processing.
+ // Enabled by default and is only disabled in the raw string literals.
+ //
+ private:
+ using base = char_scanner;
+
+ xchar
+ get (bool escape = true);
+
+ void
+ get (const xchar& peeked) {base::get (peeked);} // Extract a character previously returned by peek().
+
+ xchar
+ peek (bool escape = true);
+
+ private:
+ const path name_; // Source name used in diagnostics locations.
+ fail_mark fail; // Issues "error" diagnostics; constructed with a pointer to name_.
+ };
+
+ // Diagnostics plumbing. We assume that any diag stream for which we can
+ // use token as location has its aux data pointing to pointer to path.
+ //
+    // Return the diagnostics location of the token in the source with the
+    // specified path. Note that the location refers to (rather than copies)
+    // the path.
+    //
+    inline location
+    get_location (const token& t, const path& p)
+    {
+      location l (&p, t.line, t.column);
+      return l;
+    }
+
+    // As above but for use as a diag stream's location callback: here data
+    // must be non-null and point to a pointer to the source path.
+    //
+    inline location
+    get_location (const token& t, const void* data)
+    {
+      assert (data != nullptr); // E.g., must be &parser::path_.
+
+      const path* const* pp (static_cast<const path* const*> (data));
+      return get_location (t, **pp);
+    }
+ }
+}
+
+#endif // BUILD2_CC_LEXER_HXX
diff --git a/unit-tests/cc/lexer/buildfile b/unit-tests/cc/lexer/buildfile
new file mode 100644
index 0000000..ff4e0b3
--- /dev/null
+++ b/unit-tests/cc/lexer/buildfile
@@ -0,0 +1,17 @@
+# file : unit-tests/cc/lexer/buildfile
+# copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+# license : MIT; see accompanying LICENSE file
+
+#@@ Temporary until we get utility library support.
+#
+import libs = libbutl%lib{butl}
+src = cc/lexer token lexer diagnostics utility variable name b-options types-parsers \
+context scope parser target operation rule prerequisite file module function \
+functions-builtin functions-path functions-process-path functions-string \
+functions-target-triplet algorithm search dump filesystem scheduler \
+config/{utility init operation module} spec
+
+exe{driver}: cxx{driver} ../../../build2/cxx{$src} ../../../build2/liba{b} \
+$libs test{*}
+
+include ../../../build2/
diff --git a/unit-tests/cc/lexer/char-literal.test b/unit-tests/cc/lexer/char-literal.test
new file mode 100644
index 0000000..f256785
--- /dev/null
+++ b/unit-tests/cc/lexer/char-literal.test
@@ -0,0 +1,67 @@
+# file : unit-tests/cc/lexer/char-literal.test
+# copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+# license : MIT; see accompanying LICENSE file
+
+# Test character literals.
+#
+
+: normal
+:
+$* <<EOI >>EOO
+'a'
+'aa'
+'"'
+EOI
+<char literal>
+<char literal>
+<char literal>
+EOO
+
+: prefix
+:
+$* <<EOI >>EOO
+L'a'
+U'a'
+u'a'
+u8'a'
+u8R'a'
+EOI
+<char literal>
+<char literal>
+<char literal>
+<char literal>
+'u8R'
+<char literal>
+EOO
+
+: suffix
+:
+$* <<EOI >>EOO
+'a'x
+'a'_X123
+EOI
+<char literal>
+<char literal>
+EOO
+
+: escape
+:
+$* <<EOI >>EOO
+'\''
+'\\'
+'\\\''
+'\n'
+U'\U0001f34c'
+EOI
+<char literal>
+<char literal>
+<char literal>
+<char literal>
+<char literal>
+EOO
+
+: unterminated
+:
+$* <"'a" 2>>EOE != 0
+stdin:1:1: error: unterminated literal
+EOE
diff --git a/unit-tests/cc/lexer/comment.test b/unit-tests/cc/lexer/comment.test
new file mode 100644
index 0000000..e90d8e0
--- /dev/null
+++ b/unit-tests/cc/lexer/comment.test
@@ -0,0 +1,88 @@
+# file : unit-tests/cc/lexer/comment.test
+# copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+# license : MIT; see accompanying LICENSE file
+
+# Test C and C++ comments.
+#
+
+: c-comment
+:
+$* <<EOI
+/* 'one' */
+/* "two" // three
+*/
+/**
+four
+// five */
+/**
+six /*
+*/
+EOI
+
+: cxx-comment
+:
+$* <<EOI
+// 'one'
+// "two" // three
+// four /* five */
+EOI
+
+: commented-out
+:
+$* <<EOI >"';'"
+// /*
+;
+// */
+EOI
+
+: c-unterminated
+:
+$* <<EOI 2>>EOE != 0
+/*
+comment
+EOI
+stdin:1:2: error: unterminated comment
+EOE
+
+: cxx-unterminated
+:
+$* <<:EOI
+// comment
+EOI
+
+: in-char-literal
+:
+$* <<EOI >>EOO
+'//'
+'/*'*/
+EOI
+<char literal>
+<char literal>
+<punctuation>
+<punctuation>
+EOO
+
+: in-string-literal
+:
+$* <<EOI >>EOO
+"//foo"
+"/*"*/
+EOI
+<string literal>
+<string literal>
+<punctuation>
+<punctuation>
+EOO
+
+: in-raw-string-literal
+:
+$* <<EOI >>EOO
+R"X(
+// foo
+/* bar
+)X"*/
+EOI
+<string literal>
+<punctuation>
+<punctuation>
+EOO
diff --git a/unit-tests/cc/lexer/driver.cxx b/unit-tests/cc/lexer/driver.cxx
new file mode 100644
index 0000000..db3f516
--- /dev/null
+++ b/unit-tests/cc/lexer/driver.cxx
@@ -0,0 +1,66 @@
+// file : unit-tests/cc/lexer/driver.cxx -*- C++ -*-
+// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+// license : MIT; see accompanying LICENSE file
+
+#include <cassert>
+#include <iostream>
+
+#include <build2/types.hxx>
+#include <build2/utility.hxx>
+
+#include <build2/cc/lexer.hxx>
+
+using namespace std;
+
+namespace build2
+{
+  namespace cc
+  {
+    // Test driver: tokenize the specified file (or stdin if no file is
+    // specified) and print the tokens to stdout, one per line. Return 0 on
+    // success and 1 if the failed exception was thrown.
+    //
+    // Usage: argv[0] [<file>]
+    //
+    int
+    main (int argc, char* argv[])
+    {
+      try
+      {
+        istream* is;
+        const char* in;
+
+        // Reading from file is several times faster.
+        //
+        ifdstream ifs;
+        if (argc > 1)
+        {
+          in = argv[1];
+          ifs.open (in);
+          is = &ifs;
+        }
+        else
+        {
+          in = "stdin";
+          cin.exceptions (istream::failbit | istream::badbit);
+          is = &cin;
+        }
+
+        lexer l (*is, path (in));
+
+        // No use printing eos since we will either get it or loop forever.
+        //
+        // Note that we terminate the lines with '\n' rather than endl since
+        // flushing stdout after every token is pure overhead (the stream is
+        // flushed on normal program exit).
+        //
+        for (token t; l.next (t) != token_type::eos; )
+          cout << t << '\n';
+      }
+      catch (const failed&)
+      {
+        return 1;
+      }
+
+      return 0;
+    }
+  }
+}
+
+int
+main (int argc, char* argv[]) // Process entry point.
+{
+ return build2::cc::main (argc, argv); // All the work is done by the namespace-level main().
+}
diff --git a/unit-tests/cc/lexer/line.test b/unit-tests/cc/lexer/line.test
new file mode 100644
index 0000000..9eda9c3
--- /dev/null
+++ b/unit-tests/cc/lexer/line.test
@@ -0,0 +1,67 @@
+# file : unit-tests/cc/lexer/line.test
+# copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+# license : MIT; see accompanying LICENSE file
+
+# Test line continuations.
+#
+
+: identifier
+:
+$* <<EOI >"'foo123'"
+fo\
+o\
+1\
+2\
+3
+EOI
+
+: punctuation
+:
+$* <<EOI >'<punctuation>'
+.\
+.\
+.
+EOI
+
+: c-comment
+:
+$* <<EOI
+/\
+*
+comment
+*\
+/\
+
+EOI
+
+: cxx-comment
+:
+$* <<EOI
+/\
+/ comment\
+more\
+more
+EOI
+
+: other
+:
+$* <<EOI >>EOO
+\abc
+EOI
+<punctuation>
+'abc'
+EOO
+
+: multiple
+:
+$* <<EOI >>EOO
+\\
+EOI
+<punctuation>
+EOO
+
+: unterminated
+:
+$* <<:EOI >'<punctuation>'
+\
+EOI
diff --git a/unit-tests/cc/lexer/number.test b/unit-tests/cc/lexer/number.test
new file mode 100644
index 0000000..1d9b9c5
--- /dev/null
+++ b/unit-tests/cc/lexer/number.test
@@ -0,0 +1,48 @@
+# file : unit-tests/cc/lexer/number.test
+# copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+# license : MIT; see accompanying LICENSE file
+
+# Test numbers.
+#
+
+$* <'1' >'<number literal>'
+$* <'.1' >'<number literal>'
+$* <'1.' >'<number literal>'
+
+$* <'0b101' >'<number literal>'
+$* <'0123' >'<number literal>'
+$* <'0X12AB' >'<number literal>'
+
+$* <'1e10' >'<number literal>'
+$* <'1E+10' >'<number literal>'
+$* <'0x1.p10' >'<number literal>'
+$* <'0x1.P-10' >'<number literal>'
+
+$* <"123'456" >'<number literal>'
+$* <"0xff00'00ff" >'<number literal>'
+
+$* <'123f' >'<number literal>'
+$* <'123UL' >'<number literal>'
+$* <'123_X' >'<number literal>'
+
+: separate-punctuation
+:
+$* <'123;' >>EOO
+<number literal>
+';'
+EOO
+
+: separate-plus-minus
+:
+$* <'1.0_a+2.0' >>EOO
+<number literal>
+<punctuation>
+<number literal>
+EOO
+
+: separate-whitespace
+:
+$* <'123 abc' >>EOO
+<number literal>
+'abc'
+EOO
diff --git a/unit-tests/cc/lexer/preprocessor.test b/unit-tests/cc/lexer/preprocessor.test
new file mode 100644
index 0000000..2917649
--- /dev/null
+++ b/unit-tests/cc/lexer/preprocessor.test
@@ -0,0 +1,38 @@
+# file : unit-tests/cc/lexer/preprocessor.test
+# copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+# license : MIT; see accompanying LICENSE file
+
+# Test preprocessor lines.
+#
+
+: normal
+:
+$* <<EOI
+#pragma message("abc")
+EOI
+
+: multiline
+:
+$* <<EOI
+#pragma message \
+( \
+"abc" \
+)
+EOI
+
+: comment
+:
+$* <<EOI
+#pragma foo /*
+bar
+baz
+*/
+#pragma foo // bar baz
+EOI
+
+: line
+:
+$* <<EOI
+# 1 "test.cxx" 2
+#line 8 "z:\\tmp\\test.hxx"
+EOI
diff --git a/unit-tests/cc/lexer/raw-string-literal.test b/unit-tests/cc/lexer/raw-string-literal.test
new file mode 100644
index 0000000..e8e8b6b
--- /dev/null
+++ b/unit-tests/cc/lexer/raw-string-literal.test
@@ -0,0 +1,90 @@
+# file : unit-tests/cc/lexer/raw-string-literal.test
+# copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+# license : MIT; see accompanying LICENSE file
+
+# Test raw string literals.
+#
+
+: normal
+:
+$* <<EOI >>EOO
+R"()"
+R"(ab)"
+R"(a"b)"
+R"(a)b)"
+R"%(a%)b)%"
+R"X(a
+ b)X"
+R"X(a\
+ b)X"
+EOI
+<string literal>
+<string literal>
+<string literal>
+<string literal>
+<string literal>
+<string literal>
+<string literal>
+EOO
+
+: prefix
+:
+$* <<EOI >>EOO
+LR"(ab)"
+UR"(ab)"
+uR"(ab)"
+u8R"(ab)"
+EOI
+<string literal>
+<string literal>
+<string literal>
+<string literal>
+EOO
+
+: suffix
+:
+$* <<EOI >>EOO
+R"(ab)"x
+R"(ab)"_X123
+EOI
+<string literal>
+<string literal>
+EOO
+
+: escape
+:
+$* <<EOI >>EOO
+R"(\)"
+EOI
+<string literal>
+EOO
+
+: invalid-no-paren
+:
+$* <'R"a"' 2>>EOE != 0
+stdin:1:2: error: invalid raw literal
+EOE
+
+: invalid-paren
+:
+$* <'R")()("' 2>>EOE != 0
+stdin:1:2: error: invalid raw literal
+EOE
+
+: invalid-unterminated-paren
+:
+$* <'R"(abc"' 2>>EOE != 0
+stdin:1:2: error: invalid raw literal
+EOE
+
+: invalid-unterminated-delimiter
+:
+$* <'R"X(abc)"' 2>>EOE != 0
+stdin:1:2: error: invalid raw literal
+EOE
+
+: invalid-unterminated-quote
+:
+$* <'R"X(abc)X' 2>>EOE != 0
+stdin:1:2: error: invalid raw literal
+EOE
diff --git a/unit-tests/cc/lexer/string-literal.test b/unit-tests/cc/lexer/string-literal.test
new file mode 100644
index 0000000..062d290
--- /dev/null
+++ b/unit-tests/cc/lexer/string-literal.test
@@ -0,0 +1,65 @@
+# file : unit-tests/cc/lexer/string-literal.test
+# copyright : Copyright (c) 2014-2017 Code Synthesis Ltd
+# license : MIT; see accompanying LICENSE file
+
+# Test string literals (except raw).
+#
+
+: normal
+:
+$* <<EOI >>EOO
+"aa"
+"'"
+"a""b"
+EOI
+<string literal>
+<string literal>
+<string literal>
+<string literal>
+EOO
+
+: prefix
+:
+$* <<EOI >>EOO
+L"ab"
+U"ab"
+u"ab"
+u8"ab"
+EOI
+<string literal>
+<string literal>
+<string literal>
+<string literal>
+EOO
+
+: suffix
+:
+$* <<EOI >>EOO
+"ab"x
+"ab"_X123
+EOI
+<string literal>
+<string literal>
+EOO
+
+: escape
+:
+$* <<EOI >>EOO
+"\"\""
+"\\\\"
+"\\\"\\"
+"\n\t"
+U"a\U0001f34c"
+EOI
+<string literal>
+<string literal>
+<string literal>
+<string literal>
+<string literal>
+EOO
+
+: unterminated
+:
+$* <'"ab' 2>>EOE != 0
+stdin:1:1: error: unterminated literal
+EOE