Handle #line directives in C/C++ lexer

This way the parser now reports the logical rather than the physical location in diagnostics.
author: Boris Kolpackov <boris@codesynthesis.com> 2017-05-27 15:24:25 +0200
committer: Boris Kolpackov <boris@codesynthesis.com> 2017-05-27 15:24:25 +0200
commit: 2e19434e09b819105055ddc8e58f69db98ec8669 (patch)
tree: e806e15f6e940a9135f0e7d8cf9ba08637512bd8
parent: de417f02b2b1f3a02c5c9d206f399c574a93bf7f (diff)
9 files changed, 227 insertions, 73 deletions
diff --git a/build2/cc/lexer.cxx b/build2/cc/lexer.cxx
index 05c734c..40178bb 100644
--- a/build2/cc/lexer.cxx
+++ b/build2/cc/lexer.cxx
@@ -25,6 +25,18 @@ namespace build2
 {
   namespace cc
   {
+    inline void lexer::
+    get (const xchar& c)
+    {
+      // Increment the logical line the same way base increments the
+      // physical one (the column counts are the same).
+      //
+      if (log_line_ && c == '\n' && !unget_ && !unpeek_)
+        ++*log_line_;
+
+      base::get (c);
+    }
+
     inline auto lexer::
     get (bool e) -> xchar
     {
@@ -36,7 +48,7 @@ namespace build2
       else
       {
         xchar c (peek (e));
-        base::get (c);
+        get (c);
         return c;
       }
     }
@@ -54,12 +66,12 @@ namespace build2
 
       if (e && c == '\\')
       {
-        base::get (c);
+        get (c);
         xchar p (base::peek ());
 
         if (p == '\n')
         {
-          base::get (p);
+          get (p);
           return peek (e); // Recurse.
         }
 
@@ -80,7 +92,8 @@ namespace build2
     {
       for (;; c = skip_spaces ())
       {
-        t.line = c.line;
+        t.file = log_file_;
+        t.line = log_line_ ? * log_line_ : c.line;
         t.column = c.column;
 
         if (eos (c))
@@ -101,19 +114,52 @@ namespace build2
             // that we assume there cannot be #include directives.
             //
             // This may not work for things like #error that can contain
-            // pretty much anything. Also note that lines that start with
-            // # can contain # further down.
+            // pretty much anything. Also note that lines that start with #
+            // can contain # further down.
+            //
+            // Finally, to support diagnostics properly we need to recognize
+            // #line directives.
             //
             if (ignore_pp)
             {
-              for (;;)
+              for (bool first (true);;)
               {
+                // Note that we keep using the passed token for buffers.
+                //
                 c = skip_spaces (false); // Stop at newline.
 
                 if (eos (c) || c == '\n')
                   break;
 
-                next (t, c, false); // Keep using the passed token for buffers.
+                if (first)
+                {
+                  first = false;
+
+                  // Recognize #line and its shorthand version:
+                  //
+                  // #line <integer> [<string literal>] ...
+                  // #     <integer> [<string literal>] ...
+                  //
+                  if (!(c >= '0' && c <= '9'))
+                  {
+                    next (t, c, false);
+
+                    if (t.type != type::identifier || t.value != "line")
+                      continue;
+
+                    c = skip_spaces (false);
+
+                    if (!(c >= '0' && c <= '9'))
+                      fail (c) << "line number expected after #line directive";
+                  }
+
+                  // Ok, this is #line and next comes the line number.
+                  //
+                  line_directive (t, c);
+                  continue; // Parse the tail, if any.
+                }
+
+                next (t, c, false);
               }
               break;
             }
@@ -356,9 +402,6 @@ namespace build2
     void lexer::
     number_literal (token& t, xchar c)
     {
-      t.line = c.line;
-      t.column = c.column;
-
       // A number (integer or floating point literal) can:
       //
       // 1. Start with a dot (which must be followed by a digit, e.g., .123).
@@ -462,17 +505,15 @@ namespace build2
     void lexer::
     char_literal (token& t, xchar c)
     {
-      t.line = c.line;
-      t.column = c.column;
+      uint64_t ln (c.line);
+      uint64_t cn (c.column);
 
-      char p (c); // Previous character (see below).
-
-      for (;;)
+      for (char p (c);;) // Previous character (see below).
       {
         c = get ();
 
-        if (eos (c))
-          fail (location (&name_, t.line, t.column)) << "unterminated literal";
+        if (eos (c) || c == '\n')
+          fail (location (&name_, ln, cn)) << "unterminated character literal";
 
         if (c == '\'' && p != '\\')
           break;
@@ -494,17 +535,15 @@ namespace build2
     void lexer::
     string_literal (token& t, xchar c)
     {
-      t.line = c.line;
-      t.column = c.column;
-
-      char p (c); // Previous character (see below).
+      uint64_t ln (c.line);
+      uint64_t cn (c.column);
 
-      for (;;)
+      for (char p (c);;) // Previous character (see below).
       {
         c = get ();
 
-        if (eos (c))
-          fail (location (&name_, t.line, t.column)) << "unterminated literal";
+        if (eos (c) || c == '\n')
+          fail (location (&name_, ln, cn)) << "unterminated string literal";
 
         if (c == '\"' && p != '\\')
           break;
@@ -526,9 +565,6 @@ namespace build2
     void lexer::
     raw_string_literal (token& t, xchar c)
     {
-      t.line = c.line;
-      t.column = c.column;
-
       // The overall form is:
       //
       // R"<delimiter>(<raw_characters>)<delimiter>"
@@ -540,6 +576,8 @@ namespace build2
       // Note that the <raw_characters> are not processed in any way, not even
       // for line continuations.
       //
+      uint64_t ln (c.line);
+      uint64_t cn (c.column);
 
       // As a first step, parse the delimiter (including the openning paren).
       //
@@ -550,7 +588,7 @@ namespace build2
         c = get ();
 
         if (eos (c) || c == '\"' || c == ')' || c == '\\' || c == ' ')
-          fail (location (&name_, t.line, t.column)) << "invalid raw literal";
+          fail (location (&name_, ln, cn)) << "invalid raw string literal";
 
         if (c == '(')
           break;
@@ -567,8 +605,8 @@ namespace build2
       {
         c = get (false); // No newline escaping.
 
-        if (eos (c))
-          fail (location (&name_, t.line, t.column)) << "invalid raw literal";
+        if (eos (c)) // Note: newline is ok.
+          fail (location (&name_, ln, cn)) << "invalid raw string literal";
 
         if (c != d[i] && i != 0) // Restart from the beginning.
           i = 0;
@@ -596,6 +634,86 @@ namespace build2
       for (get (c); (c = peek ()) == '_' || alnum (c); get (c)) ;
     }
 
+    void lexer::
+    line_directive (token& t, xchar c)
+    {
+      // enter: first digit of the line number
+      // leave: last character of the line number or file string
+
+      // If our number and string tokens contained the literal values, then we
+      // could have used those. However, we ignore the values (along with
+      // escape processing, etc.) for performance. Let's keep it that way and
+      // instead handle it ourselves.
+      //
+      {
+        string& s (t.value);
+
+        for (s = c; (c = peek ()) >= '0' && c <= '9'; get (c))
+          s += c;
+
+        // The newline that ends the directive will increment the logical line
+        // so subtract one to compensate. Note: can't be 0 and shouldn't throw
+        // for valid lines.
+        //
+        log_line_ = stoull (s.c_str ()) - 1;
+      }
+
+      // See if we have the file.
+      //
+      c = skip_spaces (false);
+
+      if (c == '\"')
+      {
+        string s (move (log_file_).string ()); // Move string rep out.
+        s.clear ();
+
+        uint64_t ln (c.line);
+        uint64_t cn (c.column);
+
+        for (char p ('\0'); p != '\"'; ) // Previous character.
+        {
+          c = get ();
+
+          if (eos (c) || c == '\n')
+            fail (location (&name_, ln, cn)) << "unterminated string literal";
+
+          // Handle escapes.
+          //
+          if (p == '\\')
+          {
+            p = '\0'; // Clear so we don't confuse \" and \\".
+
+            // We only handle what can reasonably be expected in a file name.
+            //
+            switch (c)
+            {
+            case '\\':
+            case '\'':
+            case '\"': break; // Add as is.
+            default:
+              fail (c) << "unsupported escape sequence in #line directive";
+            }
+          }
+          else
+          {
+            p = c;
+
+            switch (c)
+            {
+            case '\\':
+            case '\"': continue;
+            }
+          }
+
+          s += c;
+        }
+
+        log_file_ = path (move (s)); // Move back in.
+      }
+      else
+        unget (c);
+    }
+
     auto lexer::
     skip_spaces (bool nl) -> xchar
     {
diff --git a/build2/cc/lexer.hxx b/build2/cc/lexer.hxx
index 7865a4e..8767606 100644
--- a/build2/cc/lexer.hxx
+++ b/build2/cc/lexer.hxx
@@ -22,8 +22,10 @@ namespace build2
     //
     // The input is a (partially-)preprocessed translation unit that may still
     // contain comments, line continuations, and preprocessor directives such
-    // as #line, #pragma, etc. Currently all preprocessor directives are
-    // discarded and no values are saved for literals.
+    // as #line, #pragma, etc., but not #include's. Currently all preprocessor
+    // directives except #line are ignored and no values are saved from
+    // literals. The #line directive (and its shorthand notation) is
+    // recognized to provide the logical token location.
     //
     enum class token_type
     {
@@ -51,6 +53,7 @@ namespace build2
       token_type type;
       string     value;
 
+      path     file;
       uint64_t line;
       uint64_t column;
 
@@ -74,7 +77,10 @@ namespace build2
     {
     public:
       lexer (istream& is, const path& name)
-          : char_scanner (is, false), name_ (name), fail ("error", &name_) {}
+          : char_scanner (is, false),
+            name_ (name),
+            fail ("error", &name_),
+            log_file_ (name) {}
 
       const path&
       name () const {return name_;}
@@ -121,6 +127,9 @@ namespace build2
       void
       literal_suffix (xchar);
 
+      void
+      line_directive (token&, xchar);
+
       xchar
       skip_spaces (bool newline = true);
 
@@ -134,7 +143,7 @@ namespace build2
       get (bool escape = true);
 
       void
-      get (const xchar& peeked) {base::get (peeked);}
+      get (const xchar& peeked);
 
       xchar
       peek (bool escape = true);
@@ -142,23 +151,20 @@ namespace build2
     private:
       const path name_;
       const fail_mark fail;
+
+      // Logical file and line as set by the #line directives. Note that the
+      // lexer diagnostics still use the physical file/lines.
+      //
+      path               log_file_;
+      optional<uint64_t> log_line_;
     };
 
-    // Diagnostics plumbing. We assume that any diag stream for which we can
-    // use token as location has its aux data pointing to pointer to path.
+    // Diagnostics plumbing.
     //
     inline location
-    get_location (const token& t, const path& p)
-    {
-      return location (&p, t.line, t.column);
-    }
-
-    inline location
-    get_location (const token& t, const void* data)
+    get_location (const token& t, const void*)
     {
-      assert (data != nullptr); // E.g., must be &parser::path_.
-      const path* p (*static_cast<const path* const*> (data));
-      return get_location (t, *p);
+      return location (&t.file, t.line, t.column);
     }
   }
 }
diff --git a/build2/cc/parser.cxx b/build2/cc/parser.cxx
index b21e99f..24de7ba 100644
--- a/build2/cc/parser.cxx
+++ b/build2/cc/parser.cxx
@@ -18,9 +18,7 @@ namespace build2
     translation_unit parser::
     parse (istream& is, const path& name)
     {
-      name_ = &name;
-
-      lexer l (is, *name_);
+      lexer l (is, name);
       l_ = &l;
 
       translation_unit u;
diff --git a/build2/cc/parser.hxx b/build2/cc/parser.hxx
index d52ddc9..00be190 100644
--- a/build2/cc/parser.hxx
+++ b/build2/cc/parser.hxx
@@ -30,8 +30,6 @@ namespace build2
     class parser
     {
     public:
-      parser (): fail ("error", &name_), warn ("warning", &name_) {}
-
       translation_unit
       parse (istream&, const path& name);
 
@@ -46,11 +44,6 @@ namespace build2
       parse_module_name (token&);
 
     private:
-      const path* name_;
-
-      const fail_mark  fail;
-      const basic_mark warn;
-
       lexer* l_;
       translation_unit* u_;
     };
diff --git a/unit-tests/cc/lexer/char-literal.test b/unit-tests/cc/lexer/char-literal.test
index f256785..f2c6249 100644
--- a/unit-tests/cc/lexer/char-literal.test
+++ b/unit-tests/cc/lexer/char-literal.test
@@ -63,5 +63,5 @@ EOO
 : unterminated
 :
 $* <"'a" 2>>EOE != 0
-stdin:1:1: error: unterminated literal
+stdin:1:1: error: unterminated character literal
 EOE
diff --git a/unit-tests/cc/lexer/driver.cxx b/unit-tests/cc/lexer/driver.cxx
index db3f516..5803a88 100644
--- a/unit-tests/cc/lexer/driver.cxx
+++ b/unit-tests/cc/lexer/driver.cxx
@@ -16,38 +16,59 @@ namespace build2
 {
   namespace cc
   {
-    // Usage: argv[0] [<file>]
+    // Usage: argv[0] [-l] [<file>]
     //
     int
     main (int argc, char* argv[])
     {
+      bool loc (false);
+      const char* file (nullptr);
+
+      for (int i (1); i != argc; ++i)
+      {
+        string a (argv[i]);
+
+        if (a == "-l")
+          loc = true;
+        else
+        {
+          file = argv[i];
+          break;
+        }
+      }
+
       try
       {
         istream* is;
-        const char* in;
 
         // Reading from file is several times faster.
         //
         ifdstream ifs;
-        if (argc > 1)
+        if (file != nullptr)
         {
-          in = argv[1];
-          ifs.open (in);
+          ifs.open (file);
           is = &ifs;
         }
         else
         {
-          in = "stdin";
+          file = "stdin";
           cin.exceptions (istream::failbit | istream::badbit);
           is = &cin;
         }
 
-        lexer l (*is, path (in));
+        lexer l (*is, path (file));
 
         // No use printing eos since we will either get it or loop forever.
         //
         for (token t; l.next (t) != token_type::eos; )
-          cout << t << endl;
+        {
+          cout << t;
+
+          if (loc)
+            cout << ' ' << t.file << ':' << t.line << ':' << t.column;
+
+          cout << endl;
+        }
       }
       catch (const failed&)
       {
diff --git a/unit-tests/cc/lexer/preprocessor.test b/unit-tests/cc/lexer/preprocessor.test
index e082062..a3fab9f 100644
--- a/unit-tests/cc/lexer/preprocessor.test
+++ b/unit-tests/cc/lexer/preprocessor.test
@@ -32,10 +32,28 @@ EOI
 
 : line
 :
-$* <<EOI
+$* -l <<EOI >>EOO
+;
 # 1 "test.cxx" 2
+;
+  ;
+# 4
+;
 #line 8 "z:\\tmp\\test.hxx"
+;
+#line 10
+;
+# 5 "test.cxx"
+;
 EOI
+';' stdin:1:1
+';' test.cxx:1:1
+';' test.cxx:2:3
+';' test.cxx:4:1
+';' z:\tmp\test.hxx:8:1
+';' z:\tmp\test.hxx:10:1
+';' test.cxx:5:1
+EOO
 
 : nested
 :
diff --git a/unit-tests/cc/lexer/raw-string-literal.test b/unit-tests/cc/lexer/raw-string-literal.test
index e8e8b6b..7d5b920 100644
--- a/unit-tests/cc/lexer/raw-string-literal.test
+++ b/unit-tests/cc/lexer/raw-string-literal.test
@@ -62,29 +62,29 @@ EOO
 : invalid-no-paren
 :
 $* <'R"a"' 2>>EOE != 0
-stdin:1:2: error: invalid raw literal
+stdin:1:2: error: invalid raw string literal
 EOE
 
 : invalid-paren
 :
 $* <'R")()("' 2>>EOE != 0
-stdin:1:2: error: invalid raw literal
+stdin:1:2: error: invalid raw string literal
 EOE
 
 : invalid-unterminated-paren
 :
 $* <'R"(abc"' 2>>EOE != 0
-stdin:1:2: error: invalid raw literal
+stdin:1:2: error: invalid raw string literal
 EOE
 
 : invalid-unterminated-delimiter
 :
 $* <'R"X(abc)"' 2>>EOE != 0
-stdin:1:2: error: invalid raw literal
+stdin:1:2: error: invalid raw string literal
 EOE
 
 : invalid-unterminated-quote
 :
 $* <'R"X(abc)X' 2>>EOE != 0
-stdin:1:2: error: invalid raw literal
+stdin:1:2: error: invalid raw string literal
 EOE
diff --git a/unit-tests/cc/lexer/string-literal.test b/unit-tests/cc/lexer/string-literal.test
index 062d290..f726c76 100644
--- a/unit-tests/cc/lexer/string-literal.test
+++ b/unit-tests/cc/lexer/string-literal.test
@@ -61,5 +61,5 @@ EOO
 : unterminated
 :
 $* <'"ab' 2>>EOE != 0
-stdin:1:1: error: unterminated literal
+stdin:1:1: error: unterminated string literal
 EOE
author	Boris Kolpackov <boris@codesynthesis.com>	2017-05-27 15:24:25 +0200
committer	Boris Kolpackov <boris@codesynthesis.com>	2017-05-27 15:24:25 +0200
commit	2e19434e09b819105055ddc8e58f69db98ec8669 (patch)
tree	e806e15f6e940a9135f0e7d8cf9ba08637512bd8
parent	de417f02b2b1f3a02c5c9d206f399c574a93bf7f (diff)