Distinguish token quoting type and completeness

author: Boris Kolpackov <boris@codesynthesis.com> 2016-11-25 11:18:34 +0200
committer: Boris Kolpackov <boris@codesynthesis.com> 2016-11-25 11:18:34 +0200
commit: 28f8338ded34f160e0083da9be4679bc778be7ca (patch)
tree: 7bd01311683d835f946c73d7d8220f552bae718f
parent: f32bb0aceb00cfa4bd04eea72f8fa2fe02b738b3 (diff)
10 files changed, 315 insertions, 33 deletions
diff --git a/build2/lexer.cxx b/build2/lexer.cxx
index cf8a789..b73c291 100644
--- a/build2/lexer.cxx
+++ b/build2/lexer.cxx
@@ -295,7 +295,24 @@ namespace build2
     uint64_t ln (c.line), cn (c.column);
 
     string lexeme;
-    bool quoted (m == lexer_mode::double_quoted);
+    quote_type qtype (m == lexer_mode::double_quoted
+                      ? quote_type::double_
+                      : quote_type::unquoted);
+
+    // If we are already in the quoted mode then we didn't start with the
+    // quote character.
+    //
+    bool qcomp (false);
+
+    auto append = [&lexeme, &m, &qcomp] (char c)
+    {
+      lexeme += c;
+
+      // An unquoted character after a quoted fragment.
+      //
+      if (qcomp && m != lexer_mode::double_quoted)
+        qcomp = false;
+    };
 
     for (; !eos (c); c = peek ())
     {
@@ -321,7 +338,7 @@ namespace build2
             fail (p) << "unterminated escape sequence";
 
           if (p != '\n') // Ignore if line continuation.
-            lexeme += p;
+            append (p);
 
           continue;
         }
@@ -424,6 +441,22 @@ namespace build2
               //
               mode (lexer_mode::single_quoted);
 
+              switch (qtype)
+              {
+              case quote_type::unquoted:
+                qtype = quote_type::single;
+                qcomp = lexeme.empty ();
+                break;
+              case quote_type::single:
+                qcomp = false; // Non-contiguous.
+                break;
+              case quote_type::double_:
+                qtype = quote_type::mixed;
+              case quote_type::mixed:
+                qcomp = false;
+                break;
+              }
+
               get ();
               for (c = get (); !eos (c) && c != '\''; c = get ())
                 lexeme += c;
@@ -432,8 +465,6 @@ namespace build2
                 fail (c) << "unterminated single-quoted sequence";
 
               state_.pop ();
-
-              quoted = true;
               continue;
             }
           case '\"':
@@ -444,7 +475,22 @@ namespace build2
               st = state_.top ();
               m = st.mode;
 
-              quoted = true;
+              switch (qtype)
+              {
+              case quote_type::unquoted:
+                qtype = quote_type::double_;
+                qcomp = lexeme.empty ();
+                break;
+              case quote_type::double_:
+                qcomp = false; // Non-contiguous.
+                break;
+              case quote_type::single:
+                qtype = quote_type::mixed;
+              case quote_type::mixed:
+                qcomp = false;
+                break;
+              }
+
               continue;
             }
           }
@@ -455,19 +501,27 @@ namespace build2
         break;
 
       get ();
-      lexeme += c;
+      append (c);
     }
 
-    if (eos (c) && m == lexer_mode::double_quoted)
-      fail (c) << "unterminated double-quoted sequence";
+    if (m == lexer_mode::double_quoted)
+    {
+      if (eos (c))
+        fail (c) << "unterminated double-quoted sequence";
+
+      // If we are still in the quoted mode then we didn't end with the quote
+      // character.
+      //
+      if (qcomp)
+        qcomp = false;
+    }
 
     // Expire variable mode at the end of the word.
     //
     if (m == lexer_mode::variable)
       state_.pop ();
 
-    return token (move (lexeme), sep, quoted, ln, cn);
-
+    return token (move (lexeme), sep, qtype, qcomp, ln, cn);
   }
 
   bool lexer::
diff --git a/build2/parser.cxx b/build2/parser.cxx
index 5f9850d..c2737cb 100644
--- a/build2/parser.cxx
+++ b/build2/parser.cxx
@@ -2154,7 +2154,10 @@ namespace build2
             tt != type::lparen) || peeked ().separated))
       {
         tt = type::word;
-        t = token (move (concat_str), true, false, t.line, t.column);
+        t = token (move (concat_str),
+                   true,
+                   quote_type::unquoted, false,
+                   t.line, t.column);
         concat = false;
       }
       else if (!first)
@@ -2792,7 +2795,7 @@ namespace build2
     //
     // See tests/keyword.
     //
-    if (!t.quoted)
+    if (t.qtype == quote_type::unquoted)
     {
       // We cannot peek at the whole token here since it might have to be
       // lexed in a different mode. So peek at its first character.
diff --git a/build2/test/script/lexer.cxx b/build2/test/script/lexer.cxx
index 5e6c66a..19e7498 100644
--- a/build2/test/script/lexer.cxx
+++ b/build2/test/script/lexer.cxx
@@ -137,7 +137,7 @@ namespace build2
           break;
         }
 
-        if (r.quoted)
+        if (r.qtype != quote_type::unquoted)
           ++quoted_;
 
         return r;
@@ -448,7 +448,10 @@ namespace build2
           lexeme += c;
         }
 
-        return token (move (lexeme), false, false, ln, cn);
+        return token (move (lexeme),
+                      false,
+                      quote_type::unquoted, false,
+                      ln, cn);
       }
 
       token lexer::
@@ -480,7 +483,10 @@ namespace build2
         }
 
         state_.pop (); // Expire the variable mode.
-        return token (move (lexeme), sep, false, ln, cn);
+        return token (move (lexeme),
+                      sep,
+                      quote_type::unquoted, false,
+                      ln, cn);
       }
     }
   }
diff --git a/build2/test/script/parser.cxx b/build2/test/script/parser.cxx
index 9afef75..a116873 100644
--- a/build2/test/script/parser.cxx
+++ b/build2/test/script/parser.cxx
@@ -321,7 +321,7 @@ namespace build2
             //
             lt = line_type::cmd; // Default.
 
-            if (tt == type::word && !t.quoted)
+            if (tt == type::word && t.qtype == quote_type::unquoted)
             {
               const string& n (t.value);
 
@@ -353,7 +353,7 @@ namespace build2
             //
             lt = line_type::cmd; // Default.
 
-            if (tt == type::word && !t.quoted)
+            if (tt == type::word && t.qtype == quote_type::unquoted)
             {
               const string& n (t.value);
 
@@ -719,7 +719,7 @@ namespace build2
           const token& p (peeked ());
           const location ll (get_location (p));
 
-          if (pt == type::word && !p.quoted)
+          if (pt == type::word && p.qtype == quote_type::unquoted)
           {
             if      (p.value == "elif")  lt = line_type::cmd_elif;
             else if (p.value == "elif!") lt = line_type::cmd_elifn;
@@ -1652,7 +1652,7 @@ namespace build2
                   //
                   next (t, tt);
 
-                  if (tt != type::word || t.quoted)
+                  if (tt != type::word || t.qtype != quote_type::unquoted)
                     fail (l) << "expected here-document end marker";
 
                   hd.push_back (here_doc {0, 0, 0, move (t.value), nn});
@@ -1751,7 +1751,8 @@ namespace build2
               // quoted (note that the current token is "next" and is not part
               // of this).
               //
-              bool q ((quoted () - (t.quoted ? 1 : 0)) != 0);
+              bool q ((quoted () -
+                       (t.qtype != quote_type::unquoted ? 1 : 0)) != 0);
 
               for (name& n: ns)
               {
@@ -2074,7 +2075,9 @@ namespace build2
           // Check if this is the end marker. For starters, it should be a
           // single, unquoted word followed by a newline.
           //
-          if (tt == type::word && !t.quoted && peek () == type::newline)
+          if (tt == type::word &&
+              t.qtype == quote_type::unquoted &&
+              peek () == type::newline)
           {
             const string& v (t.value);
 
@@ -2652,7 +2655,7 @@ namespace build2
           // Examine tokens we have replayed since last reset.
           //
           for (size_t i (replay_quoted_); i != replay_i_; ++i)
-            if (replay_data_[i].token.quoted)
+            if (replay_data_[i].token.qtype != quote_type::unquoted)
               ++r;
         }
 
@@ -2663,14 +2666,14 @@ namespace build2
       reset_quoted (token& cur)
       {
         if (replay_ != replay::play)
-          lexer_->reset_quoted (cur.quoted ? 1 : 0);
+          lexer_->reset_quoted (cur.qtype != quote_type::unquoted ? 1 : 0);
         else
         {
           replay_quoted_ = replay_i_ - 1;
 
           // Must be the same token.
           //
-          assert (replay_data_[replay_quoted_].token.quoted == cur.quoted);
+          assert (replay_data_[replay_quoted_].token.qtype == cur.qtype);
         }
       }
 
diff --git a/build2/token b/build2/token
index b3ebf5b..df25d4c 100644
--- a/build2/token
+++ b/build2/token
@@ -56,6 +56,11 @@ namespace build2
     value_type v_;
   };
 
+  // Token can be unquoted, single-quoted ('') or double-quoted (""). It can
+  // also be mixed.
+  //
+  enum class quote_type {unquoted, single, double_, mixed};
+
   class token;
 
   void
@@ -68,7 +73,13 @@ namespace build2
 
     token_type type;
     bool separated; // Whitespace-separated from the previous token.
-    bool quoted;    // Word (or some part of it) was quoted.
+
+    // Quoting can be complete, where the token starts and ends with the quote
+    // characters and quoting is contiguous or partial where only some part(s)
+    // of the token are quoted or quoting continues to the next token.
+    //
+    quote_type qtype;
+    bool qcomp;
 
     string value;   // Only valid for word.
 
@@ -82,12 +93,16 @@ namespace build2
         : token (token_type::eos, false, 0, 0, token_printer) {}
 
     token (token_type t, bool s, uint64_t l, uint64_t c, printer_type* p)
-        : type (t), separated (s), quoted (false),
+        : type (t), separated (s), qtype (quote_type::unquoted), qcomp (false),
           line (l), column (c),
           printer (p) {}
 
-    token (string v, bool s, bool q, uint64_t l, uint64_t c)
-        : type (token_type::word), separated (s), quoted (q), value (move (v)),
+    token (string v, bool s,
+           quote_type qt, bool qc,
+           uint64_t l, uint64_t c)
+        : type (token_type::word), separated (s),
+          qtype (qt), qcomp (qc),
+          value (move (v)),
           line (l), column (c),
           printer (&token_printer) {}
   };
diff --git a/unit-tests/buildfile b/unit-tests/buildfile
index 5d06ec7..f8cfb9d 100644
--- a/unit-tests/buildfile
+++ b/unit-tests/buildfile
@@ -2,6 +2,6 @@
 # copyright : Copyright (c) 2014-2016 Code Synthesis Ltd
 # license   : MIT; see accompanying LICENSE file
 
-d = function/ test/script/
+d = function/ lexer/ test/script/
 ./: $d
 include $d
diff --git a/unit-tests/lexer/buildfile b/unit-tests/lexer/buildfile
new file mode 100644
index 0000000..d9bd2df
--- /dev/null
+++ b/unit-tests/lexer/buildfile
@@ -0,0 +1,13 @@
+# file      : unit-tests/lexer/buildfile
+# copyright : Copyright (c) 2014-2016 Code Synthesis Ltd
+# license   : MIT; see accompanying LICENSE file
+
+#@@ Temporary until we get utility library support.
+#
+import libs = libbutl%lib{butl}
+src = token lexer diagnostics utility variable name b-options types-parsers
+
+exe{driver}: cxx{driver} ../../build2/cxx{$src} $libs \
+test{comment quoting}
+
+include ../../build2/
diff --git a/unit-tests/test/script/lexer/comment.test b/unit-tests/lexer/comment.test
index 0092ed9..07d7ac5 100644
--- a/unit-tests/test/script/lexer/comment.test
+++ b/unit-tests/lexer/comment.test
@@ -1,7 +1,6 @@
-# @@ This one should be moved to build2/lexer since we use base lexer
-#    functionality as is.
-#
-test.arguments += script-line
+# file      : unit-tests/lexer/comment.test
+# copyright : Copyright (c) 2014-2016 Code Synthesis Ltd
+# license   : MIT; see accompanying LICENSE file
 
 # Single-line comments.
 
diff --git a/unit-tests/lexer/driver.cxx b/unit-tests/lexer/driver.cxx
new file mode 100644
index 0000000..326ac8a
--- /dev/null
+++ b/unit-tests/lexer/driver.cxx
@@ -0,0 +1,94 @@
+// file      : unit-tests/lexer/driver.cxx -*- C++ -*-
+// copyright : Copyright (c) 2014-2016 Code Synthesis Ltd
+// license   : MIT; see accompanying LICENSE file
+
+#include <cassert>
+#include <iostream>
+
+#include <build2/types>
+#include <build2/utility>
+
+#include <build2/token>
+#include <build2/lexer>
+
+using namespace std;
+
+namespace build2
+{
+  // Usage: argv[0] [-q] [<lexer-mode>]
+  //
+  int
+  main (int argc, char* argv[])
+  {
+    bool quote (false);
+    lexer_mode m (lexer_mode::normal);
+
+    for (int i (1); i != argc; ++i)
+    {
+      string a (argv[i]);
+
+      if (a == "-q")
+        quote = true;
+      else
+      {
+        if      (a == "normal")    m = lexer_mode::normal;
+        else if (a == "variable")  m = lexer_mode::variable;
+        else if (a == "value")     m = lexer_mode::value;
+        else if (a == "attribute") m = lexer_mode::attribute;
+        else if (a == "eval")      m = lexer_mode::eval;
+        else                       assert (false);
+        break;
+      }
+    }
+
+    try
+    {
+      cin.exceptions (istream::failbit | istream::badbit);
+
+      // Most alternative modes auto-expire so we need something underneath.
+      //
+      lexer l (cin, path ("stdin"));
+
+      if (m != lexer_mode::normal)
+        l.mode (m);
+
+      // No use printing eos since we will either get it or loop forever.
+      //
+      for (token t (l.next ()); t.type != token_type::eos; t = l.next ())
+      {
+        // Print each token on a separate line without quoting operators.
+        //
+        t.printer (cout, t, false);
+
+        if (quote)
+        {
+          char q ('\0');
+          switch (t.qtype)
+          {
+          case quote_type::single:   q = 'S'; break;
+          case quote_type::double_:  q = 'D'; break;
+          case quote_type::mixed:    q = 'M'; break;
+          case quote_type::unquoted:          break;
+          }
+
+          if (q != '\0')
+            cout << " [" << q << (t.qcomp ? "/C" : "/P") << ']';
+        }
+
+        cout << endl;
+      }
+    }
+    catch (const failed&)
+    {
+      return 1;
+    }
+
+    return 0;
+  }
+}
+
+int
+main (int argc, char* argv[])
+{
+  return build2::main (argc, argv);
+}
diff --git a/unit-tests/lexer/quoting.test b/unit-tests/lexer/quoting.test
new file mode 100644
index 0000000..76fd904
--- /dev/null
+++ b/unit-tests/lexer/quoting.test
@@ -0,0 +1,95 @@
+# file      : unit-tests/lexer/quoting.test
+# copyright : Copyright (c) 2014-2016 Code Synthesis Ltd
+# license   : MIT; see accompanying LICENSE file
+
+test.options += -q
+
+: unquoted
+:
+$* <'foo' >>EOO
+'foo'
+<newline>
+EOO
+
+: single-comp
+:
+$* <":'foo':" >>EOO
+:
+'foo' [S/C]
+:
+<newline>
+EOO
+
+: double-comp
+:
+$* <':"foo":' >>EOO
+:
+'foo' [D/C]
+:
+<newline>
+EOO
+
+: single-empty-comp
+:
+$* <"''" >>EOO
+'' [S/C]
+<newline>
+EOO
+
+: double-empty-comp
+:
+$* <'""' >>EOO
+'' [D/C]
+<newline>
+EOO
+
+: part-start-quoted
+: Token start already quoted
+:
+$* <'"$foo"' >>EOO
+'' [D/P]
+\$
+'foo' [D/P]
+<newline>
+EOO
+
+: part-end-quoted
+: Token end still quoted
+:
+$* <'"foo$"' >>EOO
+'foo' [D/P]
+\$
+'' [D/P]
+<newline>
+EOO
+
+: part-start-unquoted
+: Token starts with unquoted character
+:
+$* <'f"oo"' >>EOO
+'foo' [D/P]
+<newline>
+EOO
+
+: part-unquoted
+: Token continues with unquoted character
+:
+$* <'"fo"o' >>EOO
+'foo' [D/P]
+<newline>
+EOO
+
+: part-unquoted-escape
+: Token continues with unquoted escaped character
+:
+$* <'"fo"\"' >>EOO
+'fo"' [D/P]
+<newline>
+EOO
+
+: mixed
+:
+$* <"\"fo\"'o'" >>EOO
+'foo' [M/P]
+<newline>
+EOO
author	Boris Kolpackov <boris@codesynthesis.com>	2016-11-25 11:18:34 +0200
committer	Boris Kolpackov <boris@codesynthesis.com>	2016-11-25 11:18:34 +0200
commit	28f8338ded34f160e0083da9be4679bc778be7ca (patch)
tree	7bd01311683d835f946c73d7d8220f552bae718f
parent	f32bb0aceb00cfa4bd04eea72f8fa2fe02b738b3 (diff)