aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBoris Kolpackov <boris@codesynthesis.com>2017-05-27 15:24:25 +0200
committerBoris Kolpackov <boris@codesynthesis.com>2017-05-27 15:24:25 +0200
commit2e19434e09b819105055ddc8e58f69db98ec8669 (patch)
treee806e15f6e940a9135f0e7d8cf9ba08637512bd8
parentde417f02b2b1f3a02c5c9d206f399c574a93bf7f (diff)
Handle #line directives in C/C++ lexer
This way the parser now reports logical rather than physical location in diagnostics.
-rw-r--r--build2/cc/lexer.cxx180
-rw-r--r--build2/cc/lexer.hxx38
-rw-r--r--build2/cc/parser.cxx4
-rw-r--r--build2/cc/parser.hxx7
-rw-r--r--unit-tests/cc/lexer/char-literal.test2
-rw-r--r--unit-tests/cc/lexer/driver.cxx37
-rw-r--r--unit-tests/cc/lexer/preprocessor.test20
-rw-r--r--unit-tests/cc/lexer/raw-string-literal.test10
-rw-r--r--unit-tests/cc/lexer/string-literal.test2
9 files changed, 227 insertions, 73 deletions
diff --git a/build2/cc/lexer.cxx b/build2/cc/lexer.cxx
index 05c734c..40178bb 100644
--- a/build2/cc/lexer.cxx
+++ b/build2/cc/lexer.cxx
@@ -25,6 +25,18 @@ namespace build2
{
namespace cc
{
+ inline void lexer::
+ get (const xchar& c)
+ {
+ // Increment the logical line similar to how base will increment the
+ // physical (the column counts are the same).
+ //
+ if (log_line_ && c == '\n' && !unget_ && !unpeek_)
+ ++*log_line_;
+
+ base::get (c);
+ }
+
inline auto lexer::
get (bool e) -> xchar
{
@@ -36,7 +48,7 @@ namespace build2
else
{
xchar c (peek (e));
- base::get (c);
+ get (c);
return c;
}
}
@@ -54,12 +66,12 @@ namespace build2
if (e && c == '\\')
{
- base::get (c);
+ get (c);
xchar p (base::peek ());
if (p == '\n')
{
- base::get (p);
+ get (p);
return peek (e); // Recurse.
}
@@ -80,7 +92,8 @@ namespace build2
{
for (;; c = skip_spaces ())
{
- t.line = c.line;
+ t.file = log_file_;
+ t.line = log_line_ ? * log_line_ : c.line;
t.column = c.column;
if (eos (c))
@@ -101,19 +114,52 @@ namespace build2
// that we assume there cannot be #include directives.
//
// This may not work for things like #error that can contain
- // pretty much anything. Also note that lines that start with
- // # can contain # further down.
+ // pretty much anything. Also note that lines that start with #
+ // can contain # further down.
+ //
+ // Finally, to support diagnostics properly we need to recognize
+ // #line directives.
//
if (ignore_pp)
{
- for (;;)
+ for (bool first (true);;)
{
+ // Note that we keep using the passed token for buffers.
+ //
c = skip_spaces (false); // Stop at newline.
if (eos (c) || c == '\n')
break;
- next (t, c, false); // Keep using the passed token for buffers.
+ if (first)
+ {
+ first = false;
+
+ // Recognize #line and its shorthand version:
+ //
+ // #line <integer> [<string literal>] ...
+ // # <integer> [<string literal>] ...
+ //
+ if (!(c >= '0' && c <= '9'))
+ {
+ next (t, c, false);
+
+ if (t.type != type::identifier || t.value != "line")
+ continue;
+
+ c = skip_spaces (false);
+
+ if (!(c >= '0' && c <= '9'))
+ fail (c) << "line number expected after #line directive";
+ }
+
+ // Ok, this is #line and next comes the line number.
+ //
+ line_directive (t, c);
+ continue; // Parse the tail, if any.
+ }
+
+ next (t, c, false);
}
break;
}
@@ -356,9 +402,6 @@ namespace build2
void lexer::
number_literal (token& t, xchar c)
{
- t.line = c.line;
- t.column = c.column;
-
// A number (integer or floating point literal) can:
//
// 1. Start with a dot (which must be followed by a digit, e.g., .123).
@@ -462,17 +505,15 @@ namespace build2
void lexer::
char_literal (token& t, xchar c)
{
- t.line = c.line;
- t.column = c.column;
+ uint64_t ln (c.line);
+ uint64_t cn (c.column);
- char p (c); // Previous character (see below).
-
- for (;;)
+ for (char p (c);;) // Previous character (see below).
{
c = get ();
- if (eos (c))
- fail (location (&name_, t.line, t.column)) << "unterminated literal";
+ if (eos (c) || c == '\n')
+ fail (location (&name_, ln, cn)) << "unterminated character literal";
if (c == '\'' && p != '\\')
break;
@@ -494,17 +535,15 @@ namespace build2
void lexer::
string_literal (token& t, xchar c)
{
- t.line = c.line;
- t.column = c.column;
-
- char p (c); // Previous character (see below).
+ uint64_t ln (c.line);
+ uint64_t cn (c.column);
- for (;;)
+ for (char p (c);;) // Previous character (see below).
{
c = get ();
- if (eos (c))
- fail (location (&name_, t.line, t.column)) << "unterminated literal";
+ if (eos (c) || c == '\n')
+ fail (location (&name_, ln, cn)) << "unterminated string literal";
if (c == '\"' && p != '\\')
break;
@@ -526,9 +565,6 @@ namespace build2
void lexer::
raw_string_literal (token& t, xchar c)
{
- t.line = c.line;
- t.column = c.column;
-
// The overall form is:
//
// R"<delimiter>(<raw_characters>)<delimiter>"
@@ -540,6 +576,8 @@ namespace build2
// Note that the <raw_characters> are not processed in any way, not even
// for line continuations.
//
+ uint64_t ln (c.line);
+ uint64_t cn (c.column);
// As a first step, parse the delimiter (including the opening paren).
//
@@ -550,7 +588,7 @@ namespace build2
c = get ();
if (eos (c) || c == '\"' || c == ')' || c == '\\' || c == ' ')
- fail (location (&name_, t.line, t.column)) << "invalid raw literal";
+ fail (location (&name_, ln, cn)) << "invalid raw string literal";
if (c == '(')
break;
@@ -567,8 +605,8 @@ namespace build2
{
c = get (false); // No newline escaping.
- if (eos (c))
- fail (location (&name_, t.line, t.column)) << "invalid raw literal";
+ if (eos (c)) // Note: newline is ok.
+ fail (location (&name_, ln, cn)) << "invalid raw string literal";
if (c != d[i] && i != 0) // Restart from the beginning.
i = 0;
@@ -596,6 +634,86 @@ namespace build2
for (get (c); (c = peek ()) == '_' || alnum (c); get (c)) ;
}
+ void lexer::
+ line_directive (token& t, xchar c)
+ {
+ // enter: first digit of the line number
+ // leave: last character of the line number or file string
+
+ // If our number and string tokens contained the literal values, then we
+ // could have used that. However, we ignore the value (along with escape
+ // processing, etc), for performance. Let's keep it that way and instead
+ // handle it ourselves.
+ //
+ {
+ string& s (t.value);
+
+ for (s = c; (c = peek ()) >= '0' && c <= '9'; get (c))
+ s += c;
+
+ // The newline that ends the directive will increment the logical line
+ // so subtract one to compensate. Note: can't be 0 and shouldn't throw
+ // for valid lines.
+ //
+ log_line_ = stoull (s.c_str ()) - 1;
+ }
+
+ // See if we have the file.
+ //
+ c = skip_spaces (false);
+
+ if (c == '\"')
+ {
+ string s (move (log_file_).string ()); // Move string rep out.
+ s.clear ();
+
+ uint64_t ln (c.line);
+ uint64_t cn (c.column);
+
+ for (char p ('\0'); p != '\"'; ) // Previous character.
+ {
+ c = get ();
+
+ if (eos (c) || c == '\n')
+ fail (location (&name_, ln, cn)) << "unterminated string literal";
+
+ // Handle escapes.
+ //
+ if (p == '\\')
+ {
+ p = '\0'; // Clear so we don't confuse \" and \\".
+
+ // We only handle what can reasonably be expected in a file name.
+ //
+ switch (c)
+ {
+ case '\\':
+ case '\'':
+ case '\"': break; // Add as is.
+ default:
+ fail (c) << "unsupported escape sequence in #line directive";
+ }
+ }
+ else
+ {
+ p = c;
+
+ switch (c)
+ {
+ case '\\':
+ case '\"': continue;
+ }
+ }
+
+ s += c;
+ }
+
+ log_file_ = path (move (s)); // Move back in.
+ }
+ else
+ unget (c);
+ }
+
auto lexer::
skip_spaces (bool nl) -> xchar
{
diff --git a/build2/cc/lexer.hxx b/build2/cc/lexer.hxx
index 7865a4e..8767606 100644
--- a/build2/cc/lexer.hxx
+++ b/build2/cc/lexer.hxx
@@ -22,8 +22,10 @@ namespace build2
//
// The input is a (partially-)preprocessed translation unit that may still
// contain comments, line continuations, and preprocessor directives such
- // as #line, #pragma, etc. Currently all preprocessor directives are
- // discarded and no values are saved for literals.
+ // as #line, #pragma, etc., but not #include's. Currently all preprocessor
+ // directives except #line are ignored and no values are saved from
+ // literals. The #line directive (and its shorthand notation) is
+ // recognized to provide the logical token location.
//
enum class token_type
{
@@ -51,6 +53,7 @@ namespace build2
token_type type;
string value;
+ path file;
uint64_t line;
uint64_t column;
@@ -74,7 +77,10 @@ namespace build2
{
public:
lexer (istream& is, const path& name)
- : char_scanner (is, false), name_ (name), fail ("error", &name_) {}
+ : char_scanner (is, false),
+ name_ (name),
+ fail ("error", &name_),
+ log_file_ (name) {}
const path&
name () const {return name_;}
@@ -121,6 +127,9 @@ namespace build2
void
literal_suffix (xchar);
+ void
+ line_directive (token&, xchar);
+
xchar
skip_spaces (bool newline = true);
@@ -134,7 +143,7 @@ namespace build2
get (bool escape = true);
void
- get (const xchar& peeked) {base::get (peeked);}
+ get (const xchar& peeked);
xchar
peek (bool escape = true);
@@ -142,23 +151,20 @@ namespace build2
private:
const path name_;
const fail_mark fail;
+
+ // Logical file and line as set by the #line directives. Note that the
+ // lexer diagnostics still uses the physical file/lines.
+ //
+ path log_file_;
+ optional<uint64_t> log_line_;
};
- // Diagnostics plumbing. We assume that any diag stream for which we can
- // use token as location has its aux data pointing to pointer to path.
+ // Diagnostics plumbing.
//
inline location
- get_location (const token& t, const path& p)
- {
- return location (&p, t.line, t.column);
- }
-
- inline location
- get_location (const token& t, const void* data)
+ get_location (const token& t, const void*)
{
- assert (data != nullptr); // E.g., must be &parser::path_.
- const path* p (*static_cast<const path* const*> (data));
- return get_location (t, *p);
+ return location (&t.file, t.line, t.column);
}
}
}
diff --git a/build2/cc/parser.cxx b/build2/cc/parser.cxx
index b21e99f..24de7ba 100644
--- a/build2/cc/parser.cxx
+++ b/build2/cc/parser.cxx
@@ -18,9 +18,7 @@ namespace build2
translation_unit parser::
parse (istream& is, const path& name)
{
- name_ = &name;
-
- lexer l (is, *name_);
+ lexer l (is, name);
l_ = &l;
translation_unit u;
diff --git a/build2/cc/parser.hxx b/build2/cc/parser.hxx
index d52ddc9..00be190 100644
--- a/build2/cc/parser.hxx
+++ b/build2/cc/parser.hxx
@@ -30,8 +30,6 @@ namespace build2
class parser
{
public:
- parser (): fail ("error", &name_), warn ("warning", &name_) {}
-
translation_unit
parse (istream&, const path& name);
@@ -46,11 +44,6 @@ namespace build2
parse_module_name (token&);
private:
- const path* name_;
-
- const fail_mark fail;
- const basic_mark warn;
-
lexer* l_;
translation_unit* u_;
};
diff --git a/unit-tests/cc/lexer/char-literal.test b/unit-tests/cc/lexer/char-literal.test
index f256785..f2c6249 100644
--- a/unit-tests/cc/lexer/char-literal.test
+++ b/unit-tests/cc/lexer/char-literal.test
@@ -63,5 +63,5 @@ EOO
: unterminated
:
$* <"'a" 2>>EOE != 0
-stdin:1:1: error: unterminated literal
+stdin:1:1: error: unterminated character literal
EOE
diff --git a/unit-tests/cc/lexer/driver.cxx b/unit-tests/cc/lexer/driver.cxx
index db3f516..5803a88 100644
--- a/unit-tests/cc/lexer/driver.cxx
+++ b/unit-tests/cc/lexer/driver.cxx
@@ -16,38 +16,59 @@ namespace build2
{
namespace cc
{
- // Usage: argv[0] [<file>]
+ // Usage: argv[0] [-l] [<file>]
//
int
main (int argc, char* argv[])
{
+ bool loc (false);
+ const char* file (nullptr);
+
+ for (int i (1); i != argc; ++i)
+ {
+ string a (argv[i]);
+
+ if (a == "-l")
+ loc = true;
+ else
+ {
+ file = argv[i];
+ break;
+ }
+ }
+
try
{
istream* is;
- const char* in;
// Reading from file is several times faster.
//
ifdstream ifs;
- if (argc > 1)
+ if (file != nullptr)
{
- in = argv[1];
- ifs.open (in);
+ ifs.open (file);
is = &ifs;
}
else
{
- in = "stdin";
+ file = "stdin";
cin.exceptions (istream::failbit | istream::badbit);
is = &cin;
}
- lexer l (*is, path (in));
+ lexer l (*is, path (file));
// No use printing eos since we will either get it or loop forever.
//
for (token t; l.next (t) != token_type::eos; )
- cout << t << endl;
+ {
+ cout << t;
+
+ if (loc)
+ cout << ' ' << t.file << ':' << t.line << ':' << t.column;
+
+ cout << endl;
+ }
}
catch (const failed&)
{
diff --git a/unit-tests/cc/lexer/preprocessor.test b/unit-tests/cc/lexer/preprocessor.test
index e082062..a3fab9f 100644
--- a/unit-tests/cc/lexer/preprocessor.test
+++ b/unit-tests/cc/lexer/preprocessor.test
@@ -32,10 +32,28 @@ EOI
: line
:
-$* <<EOI
+$* -l <<EOI >>EOO
+;
# 1 "test.cxx" 2
+;
+ ;
+# 4
+;
#line 8 "z:\\tmp\\test.hxx"
+;
+#line 10
+;
+# 5 "test.cxx"
+;
EOI
+';' stdin:1:1
+';' test.cxx:1:1
+';' test.cxx:2:3
+';' test.cxx:4:1
+';' z:\tmp\test.hxx:8:1
+';' z:\tmp\test.hxx:10:1
+';' test.cxx:5:1
+EOO
: nested
:
diff --git a/unit-tests/cc/lexer/raw-string-literal.test b/unit-tests/cc/lexer/raw-string-literal.test
index e8e8b6b..7d5b920 100644
--- a/unit-tests/cc/lexer/raw-string-literal.test
+++ b/unit-tests/cc/lexer/raw-string-literal.test
@@ -62,29 +62,29 @@ EOO
: invalid-no-paren
:
$* <'R"a"' 2>>EOE != 0
-stdin:1:2: error: invalid raw literal
+stdin:1:2: error: invalid raw string literal
EOE
: invalid-paren
:
$* <'R")()("' 2>>EOE != 0
-stdin:1:2: error: invalid raw literal
+stdin:1:2: error: invalid raw string literal
EOE
: invalid-unterminated-paren
:
$* <'R"(abc"' 2>>EOE != 0
-stdin:1:2: error: invalid raw literal
+stdin:1:2: error: invalid raw string literal
EOE
: invalid-unterminated-delimiter
:
$* <'R"X(abc)"' 2>>EOE != 0
-stdin:1:2: error: invalid raw literal
+stdin:1:2: error: invalid raw string literal
EOE
: invalid-unterminated-quote
:
$* <'R"X(abc)X' 2>>EOE != 0
-stdin:1:2: error: invalid raw literal
+stdin:1:2: error: invalid raw string literal
EOE
diff --git a/unit-tests/cc/lexer/string-literal.test b/unit-tests/cc/lexer/string-literal.test
index 062d290..f726c76 100644
--- a/unit-tests/cc/lexer/string-literal.test
+++ b/unit-tests/cc/lexer/string-literal.test
@@ -61,5 +61,5 @@ EOO
: unterminated
:
$* <'"ab' 2>>EOE != 0
-stdin:1:1: error: unterminated literal
+stdin:1:1: error: unterminated string literal
EOE