Improve escape sequence support

Specifically:

1. In double-quoted strings we now only do effective escaping of the
   special `$("\` characters, plus `)` for symmetry.

2. There is now support for "escape sequence expansion" in the form $\X,
   where \X can be any of the C/C++ simple escape sequences (\n, \t, etc.)
   plus \0 (which in C/C++ is an octal escape sequence). For example:

   info "foo$\n$\tbar$\n$\tbaz"

   Will print:

   buildfile:1:1: info: foo
           bar
           baz
author: Boris Kolpackov <boris@codesynthesis.com> 2022-12-15 11:24:18 +0200
committer: Boris Kolpackov <boris@codesynthesis.com> 2022-12-15 11:24:18 +0200
commit: 3ca670b7b7c71ca67d70cac9dffb2ba6120b2e36 (patch)
tree: 1424ac78fe10f697c8a0b63d91bb49889e8cdc85 /libbuild2
parent: 0aa7a94e1032a96a2a72cb6a82824f9fe970d412 (diff)
7 files changed, 274 insertions, 172 deletions
diff --git a/libbuild2/lexer.cxx b/libbuild2/lexer.cxx
index 9176422..d82c135 100644
--- a/libbuild2/lexer.cxx
+++ b/libbuild2/lexer.cxx
@@ -713,9 +713,9 @@ namespace build2
   }
 
   token lexer::
-  word (state st, bool sep)
+  word (const state& rst, bool sep)
   {
-    lexer_mode m (st.mode);
+    lexer_mode m (rst.mode);
 
     xchar c (peek ());
     assert (!eos (c));
@@ -746,22 +746,66 @@ namespace build2
       lexeme += c;
     };
 
-    for (; !eos (c); c = peek ())
+    const state* st (&rst);
+    for (bool first (true); !eos (c); first = false, c = peek ())
     {
       // First handle escape sequences.
       //
       if (c == '\\')
       {
-        // In the variable mode we treat the beginning of the escape sequence
-        // as a separator (think \"$foo\").
+        // In the variable mode we treat immediate `\` as the escape sequence
+        // literal and any following as a separator (think \"$foo\").
         //
         if (m == lexer_mode::variable)
-          break;
+        {
+          if (!first)
+            break;
+
+          get ();
+          c = get ();
+
+          if (eos (c))
+            fail (c) << "unterminated escape sequence";
+
+          // For now we only support all the simple C/C++ escape sequences
+          // plus \0 (which in C/C++ is an octal escape sequence).
+          //
+          // In the future we may decide to support more elaborate sequences
+          // such as \xNN, \uNNNN, etc.
+          //
+          // Note: we return it in the literal form instead of translating for
+          // easier printing.
+          //
+          switch (c)
+          {
+          case '\'':
+          case '"':
+          case '?':
+          case '\\':
+          case '0':
+          case 'a':
+          case 'b':
+          case 'f':
+          case 'n':
+          case 'r':
+          case 't':
+          case 'v': lexeme = c; break;
+          default:
+            fail (c) << "unknown escape sequence \\" << c;
+          }
+
+          state_.pop ();
+          return token (type::escape,
+                        move (lexeme),
+                        sep,
+                        qtype, qcomp, qfirst,
+                        ln, cn);
+        }
 
         get ();
         xchar p (peek ());
 
-        const char* esc (st.escapes);
+        const char* esc (st->escapes);
 
         if (esc == nullptr ||
             (*esc != '\0' && !eos (p) && strchr (esc, p) != nullptr))
@@ -777,7 +821,7 @@ namespace build2
           continue;
         }
         else
-          unget (c); // Treat as a normal character.
+          unget (c); // Fall through to treat as a normal character.
       }
 
       bool done (false);
@@ -806,8 +850,8 @@ namespace build2
             get ();
             state_.pop ();
 
-            st = state_.top ();
-            m = st.mode;
+            st = &state_.top ();
+            m = st->mode;
             continue;
           }
         }
@@ -816,19 +860,17 @@ namespace build2
       //
       else if (m == lexer_mode::variable)
       {
-        bool first (lexeme.empty ());
-
         // Handle special variable names, if any.
         //
-        if (first        &&
-            st.data != 0 &&
-            strchr (reinterpret_cast<const char*> (st.data), c) != nullptr)
+        if (first         &&
+            st->data != 0 &&
+            strchr (reinterpret_cast<const char*> (st->data), c) != nullptr)
         {
           get ();
           lexeme += c;
           done = true;
         }
-        else if (c != '_' && !(first ? alpha (c) : alnum (c)))
+        else if (c != '_' && !(lexeme.empty () ? alpha (c) : alnum (c)))
         {
           if (c != '.')
             done = true;
@@ -848,17 +890,17 @@ namespace build2
       {
         // First check if it's a pair separator.
         //
-        if (c == st.sep_pair)
+        if (c == st->sep_pair)
           done = true;
         else
         {
           // Then see if this character or character sequence is a separator.
           //
-          for (const char* p (strchr (st.sep_first, c));
+          for (const char* p (strchr (st->sep_first, c));
                p != nullptr;
                p = done ? nullptr : strchr (p + 1, c))
           {
-            char s (st.sep_second[p - st.sep_first]);
+            char s (st->sep_second[p - st->sep_first]);
 
             // See if it has a second.
             //
@@ -876,13 +918,19 @@ namespace build2
         // Handle single and double quotes if enabled for this mode and unless
         // they were considered separators.
         //
-        if (st.quotes && !done)
+        if (st->quotes && !done)
         {
           auto quoted_mode = [this] (lexer_mode m)
           {
+            // In the double-quoted mode we only do effective escaping of the
+            // special `$("\` characters plus `)` for symmetry. Nothing can be
+            // escaped in single-quoted.
+            //
+            const char* esc (m == lexer_mode::double_quoted ? "$()\"\\" : "");
+
             state_.push (state {
               m, 0, nullopt, false, false, '\0', false, true, true,
-              state_.top ().escapes, nullptr, nullptr});
+              esc, nullptr, nullptr});
           };
 
           switch (c)
@@ -933,8 +981,8 @@ namespace build2
 
               quoted_mode (lexer_mode::double_quoted);
 
-              st = state_.top ();
-              m = st.mode;
+              st = &state_.top ();
+              m = st->mode;
 
               switch (qtype)
               {
@@ -1090,6 +1138,8 @@ namespace build2
         }
       case '\\':
         {
+          // See if this is line continuation.
+          //
           get ();
 
           if (peek () == '\n')
diff --git a/libbuild2/lexer.hxx b/libbuild2/lexer.hxx
index 4371206..e913829 100644
--- a/libbuild2/lexer.hxx
+++ b/libbuild2/lexer.hxx
@@ -26,14 +26,15 @@ namespace build2
   // mode we don't treat certain characters (e.g., `+`, `=`) as special so
   // that we can use them in the variable values, e.g., `foo = g++`. In
   // contrast, in the variable mode, we restrict certain character (e.g., `/`)
-  // from appearing in the name. The values mode is like value but recogizes
-  // `,` as special (used in contexts where we need to list multiple
-  // values). The attributes/attribute_value modes are like values where each
-  // value is potentially a variable assignment; they don't treat `{` and `}`
-  // as special (so we cannot have name groups in attributes) as well as
-  // recognizes `=` and `]`. The subscript mode is like value but doesn't
-  // treat `{` and `}` as special and recognizes `]`. The eval mode is used in
-  // the evaluation context.
+  // from appearing in the name. Additionally, in the variable mode we
+  // recognize leading `\` as the beginning of the escape sequence ($\n). The
+  // values mode is like value but recognizes `,` as special (used in contexts
+  // where we need to list multiple values). The attributes/attribute_value
+  // modes are like values where each value is potentially a variable
+  // assignment; they don't treat `{` and `}` as special (so we cannot have
+  // name groups in attributes) as well as recognizes `=` and `]`. The
+  // subscript mode is like value but doesn't treat `{` and `}` as special and
+  // recognizes `]`. The eval mode is used in the evaluation context.
   //
   // A number of modes are "derived" from the value/values mode by recognizing
   // a few extra characters:
@@ -262,7 +263,7 @@ namespace build2
     // been "expired" from the top).
     //
     virtual token
-    word (state current, bool separated);
+    word (const state& current, bool separated);
 
     // Return true in first if we have seen any spaces. Skipped empty lines
     // don't count. In other words, we are only interested in spaces that are
diff --git a/libbuild2/parser.cxx b/libbuild2/parser.cxx
index b118cee..2507a02 100644
--- a/libbuild2/parser.cxx
+++ b/libbuild2/parser.cxx
@@ -7357,11 +7357,15 @@ namespace build2
           // token is a paren or a word, we turn it on and switch to the eval
           // mode if what we get next is a paren.
           //
-          // Also sniff out the special variables string from mode data for
-          // the ad hoc $() handling below.
-          //
           mode (lexer_mode::variable);
 
+          // Sniff out the special variables string from mode data and use
+          // that to recognize special variables in the ad hoc $() handling
+          // below.
+          //
+          // Note: must be done before calling next() which may expire the
+          // mode.
+          //
           auto special = [s = reinterpret_cast<const char*> (mode_data ())]
             (const token& t) -> char
           {
@@ -7400,164 +7404,202 @@ namespace build2
           next (t, tt);
           loc = get_location (t);
 
-          names qual;
-          string name;
-
-          if (t.separated)
-            ; // Leave the name empty to fail below.
-          else if (tt == type::word)
+          if (tt == type::escape)
           {
-            name = move (t.value);
+            // For now we only support all the simple C/C++ escape sequences
+            // plus \0 (which in C/C++ is an octal escape sequence). See the
+            // lexer part for details.
+            //
+            // Note: cannot be subscripted.
+            //
+            if (!pre_parse_)
+            {
+              string s;
+              switch (char c = t.value[0])
+              {
+              case '\'':
+              case '"':
+              case '?':
+              case '\\': s = c;    break;
+              case '0':  s = '\0'; break;
+              case 'a':  s = '\a'; break;
+              case 'b':  s = '\b'; break;
+              case 'f':  s = '\f'; break;
+              case 'n':  s = '\n'; break;
+              case 'r':  s = '\r'; break;
+              case 't':  s = '\t'; break;
+              case 'v':  s = '\v'; break;
+              default:
+                assert (false);
+              }
+
+              result_data = name (move (s));
+              what = "escape sequence expansion";
+            }
+
+            tt = peek ();
           }
-          else if (tt == type::lparen)
+          else
           {
-            expire_mode ();
-            mode (lexer_mode::eval, '@');
-            next_with_attributes (t, tt);
+            names qual;
+            string name;
 
-            // Handle the $(x) case ad hoc. We do it this way in order to get
-            // the variable name even during pre-parse. It should also be
-            // faster.
-            //
-            char c;
-            if ((tt == type::word
-                 ? path_traits::rfind_separator (t.value) == string::npos
-                 : (c = special (t))) &&
-                peek () == type::rparen)
+            if (t.separated)
+              ; // Leave the name empty to fail below.
+            else if (tt == type::word)
             {
-              name = (tt == type::word ? move (t.value) : string (1, c));
-              next (t, tt); // Get `)`.
+              name = move (t.value);
             }
-            else
+            else if (tt == type::lparen)
             {
-              using name_type = build2::name;
-
-              values vs (parse_eval (t, tt, pmode));
+              expire_mode ();
+              mode (lexer_mode::eval, '@');
+              next_with_attributes (t, tt);
 
-              if (!pre_parse_)
+              // Handle the $(x) case ad hoc. We do it this way in order to
+              // get the variable name even during pre-parse. It should also
+              // be faster.
+              //
+              char c;
+              if ((tt == type::word
+                   ? path_traits::rfind_separator (t.value) == string::npos
+                   : (c = special (t))) &&
+                  peek () == type::rparen)
               {
-                if (vs.size () != 1)
-                  fail (loc) << "expected single variable/function name";
+                name = (tt == type::word ? move (t.value) : string (1, c));
+                next (t, tt); // Get `)`.
+              }
+              else
+              {
+                using name_type = build2::name;
 
-                value& v (vs[0]);
+                values vs (parse_eval (t, tt, pmode));
 
-                if (!v)
-                  fail (loc) << "null variable/function name";
+                if (!pre_parse_)
+                {
+                  if (vs.size () != 1)
+                    fail (loc) << "expected single variable/function name";
 
-                names storage;
-                vector_view<name_type> ns (
-                  reverse (v, storage, true /* reduce */)); // Movable.
-                size_t n (ns.size ());
+                  value& v (vs[0]);
 
-                // We cannot handle scope-qualification in the eval context as
-                // we do for target-qualification (see eval-qual) since then
-                // we would be treating all paths as qualified variables. So
-                // we have to do it here.
-                //
-                if      (n >= 2 && ns[0].pair == ':')   // $(foo: x)
-                {
-                  // Note: name is first (see eval for details).
+                  if (!v)
+                    fail (loc) << "null variable/function name";
+
+                  names storage;
+                  vector_view<name_type> ns (
+                    reverse (v, storage, true /* reduce */)); // Movable.
+                  size_t n (ns.size ());
+
+                  // We cannot handle scope-qualification in the eval context
+                  // as we do for target-qualification (see eval-qual) since
+                  // then we would be treating all paths as qualified
+                  // variables. So we have to do it here.
                   //
-                  qual.push_back (move (ns[1]));
+                  if      (n >= 2 && ns[0].pair == ':')   // $(foo: x)
+                  {
+                    // Note: name is first (see eval for details).
+                    //
+                    qual.push_back (move (ns[1]));
 
-                  if (qual.back ().empty ())
-                    fail (loc) << "empty variable/function qualification";
+                    if (qual.back ().empty ())
+                      fail (loc) << "empty variable/function qualification";
 
-                  if (n > 2)
-                    qual.push_back (move (ns[2]));
+                    if (n > 2)
+                      qual.push_back (move (ns[2]));
 
-                  // Move name to the last position (see below).
-                  //
-                  swap (ns[0], ns[n - 1]);
-                }
-                else if (n == 2 && ns[0].directory ())  // $(foo/ x)
-                {
-                  qual.push_back (move (ns[0]));
-                  qual.back ().pair = '/';
-                }
-                else if (n > 1)
-                  fail (loc) << "expected variable/function name instead of '"
-                             << ns << "'";
+                    // Move name to the last position (see below).
+                    //
+                    swap (ns[0], ns[n - 1]);
+                  }
+                  else if (n == 2 && ns[0].directory ())  // $(foo/ x)
+                  {
+                    qual.push_back (move (ns[0]));
+                    qual.back ().pair = '/';
+                  }
+                  else if (n > 1)
+                    fail (loc) << "expected variable/function name instead of '"
+                               << ns << "'";
 
-                // Note: checked for empty below.
-                //
-                if (!ns[n - 1].simple ())
-                  fail (loc) << "expected variable/function name instead of '"
-                             << ns[n - 1] << "'";
+                  // Note: checked for empty below.
+                  //
+                  if (!ns[n - 1].simple ())
+                    fail (loc) << "expected variable/function name instead of '"
+                               << ns[n - 1] << "'";
 
-                size_t p;
-                if (n == 1 &&                           // $(foo/x)
-                    (p = path_traits::rfind_separator (ns[0].value)) !=
+                  size_t p;
+                  if (n == 1 &&                           // $(foo/x)
+                      (p = path_traits::rfind_separator (ns[0].value)) !=
                       string::npos)
-                {
-                  // Note that p cannot point to the last character since then
-                  // it would have been a directory, not a simple name.
-                  //
-                  string& s (ns[0].value);
+                  {
+                    // Note that p cannot point to the last character since
+                    // then it would have been a directory, not a simple name.
+                    //
+                    string& s (ns[0].value);
 
-                  name = string (s, p + 1);
-                  s.resize (p + 1);
-                  qual.push_back (name_type (dir_path (move (s))));
-                  qual.back ().pair = '/';
+                    name = string (s, p + 1);
+                    s.resize (p + 1);
+                    qual.push_back (name_type (dir_path (move (s))));
+                    qual.back ().pair = '/';
+                  }
+                  else
+                    name = move (ns[n - 1].value);
                 }
-                else
-                  name = move (ns[n - 1].value);
               }
             }
-          }
-          else
-            fail (t) << "expected variable/function name instead of " << t;
-
-          if (!pre_parse_ && name.empty ())
-            fail (loc) << "empty variable/function name";
-
-          // Figure out whether this is a variable expansion with potential
-          // subscript or a function call.
-          //
-          if (sub) enable_subscript ();
-          tt = peek ();
+            else
+              fail (t) << "expected variable/function name instead of " << t;
 
-          // Note that we require function call opening paren to be
-          // unseparated; consider: $x ($x == 'foo' ? 'FOO' : 'BAR').
-          //
-          if (tt == type::lparen && !peeked ().separated)
-          {
-            // Function call.
-            //
-            next (t, tt); // Get '('.
-            mode (lexer_mode::eval, '@');
-            next_with_attributes (t, tt);
+            if (!pre_parse_ && name.empty ())
+              fail (loc) << "empty variable/function name";
 
-            // @@ Should we use (target/scope) qualification (of name) as the
-            // context in which to call the function? Hm, interesting...
+            // Figure out whether this is a variable expansion with potential
+            // subscript or a function call.
             //
-            values args (parse_eval (t, tt, pmode));
-
             if (sub) enable_subscript ();
             tt = peek ();
 
-            // Note that we "move" args to call().
+            // Note that we require function call opening paren to be
+            // unseparated; consider: $x ($x == 'foo' ? 'FOO' : 'BAR').
             //
-            if (!pre_parse_)
+            if (tt == type::lparen && !peeked ().separated)
             {
-              result_data = ctx->functions.call (scope_, name, args, loc);
-              what = "function call";
+              // Function call.
+              //
+              next (t, tt); // Get '('.
+              mode (lexer_mode::eval, '@');
+              next_with_attributes (t, tt);
+
+              // @@ Should we use (target/scope) qualification (of name) as
+              // the context in which to call the function? Hm, interesting...
+              //
+              values args (parse_eval (t, tt, pmode));
+
+              if (sub) enable_subscript ();
+              tt = peek ();
+
+              // Note that we "move" args to call().
+              //
+              if (!pre_parse_)
+              {
+                result_data = ctx->functions.call (scope_, name, args, loc);
+                what = "function call";
+              }
+              else
+                lookup_function (move (name), loc);
             }
             else
-              lookup_function (move (name), loc);
-          }
-          else
-          {
-            // Variable expansion.
-            //
-            lookup l (lookup_variable (move (qual), move (name), loc));
-
-            if (!pre_parse_)
             {
-              if (l.defined ())
-                result = l.value; // Otherwise leave as NULL result_data.
+              // Variable expansion.
+              //
+              lookup l (lookup_variable (move (qual), move (name), loc));
 
-              what = "variable expansion";
+              if (!pre_parse_)
+              {
+                if (l.defined ())
+                  result = l.value; // Otherwise leave as NULL result_data.
+
+                what = "variable expansion";
+              }
             }
           }
         }
diff --git a/libbuild2/test/script/lexer.cxx b/libbuild2/test/script/lexer.cxx
index b470d25..aec91fc 100644
--- a/libbuild2/test/script/lexer.cxx
+++ b/libbuild2/test/script/lexer.cxx
@@ -339,15 +339,17 @@ namespace build2
       }
 
       token lexer::
-      word (state st, bool sep)
+      word (const state& st, bool sep)
       {
-        lexer_mode m (st.mode);
+        lexer_mode m (st.mode); // Save.
 
         token r (base_lexer::word (st, sep));
 
         if (m == lexer_mode::variable)
         {
-          if (r.value.size () == 1 && digit (r.value[0])) // $N
+          if (r.type == type::word &&
+              r.value.size () == 1 &&
+              digit (r.value[0])) // $N
           {
             xchar c (peek ());
 
diff --git a/libbuild2/test/script/lexer.hxx b/libbuild2/test/script/lexer.hxx
index 993a9db..39b950a 100644
--- a/libbuild2/test/script/lexer.hxx
+++ b/libbuild2/test/script/lexer.hxx
@@ -77,7 +77,7 @@ namespace build2
         next_description ();
 
         virtual token
-        word (state, bool) override;
+        word (const state&, bool) override;
       };
     }
   }
diff --git a/libbuild2/token.cxx b/libbuild2/token.cxx
index ab14388..cc102cc 100644
--- a/libbuild2/token.cxx
+++ b/libbuild2/token.cxx
@@ -29,21 +29,30 @@ namespace build2
         os << (r ? "\n" : "<newline>");
         break;
       }
-    case token_type::pair_separator:
+    case token_type::word:
       {
         if (r)
-          os << t.value[0];
+          os << t.value;
         else
-          os << "<pair separator " << t.value[0] << ">";
+          os << '\'' << t.value << '\'';
 
         break;
       }
-    case token_type::word:
+    case token_type::escape:
       {
         if (r)
-          os << t.value;
+          os << '\\' << t.value;
         else
-          os << '\'' << t.value << '\'';
+          os << "<escape sequence \\" << t.value << ">";
+
+        break;
+      }
+    case token_type::pair_separator:
+      {
+        if (r)
+          os << t.value[0];
+        else
+          os << "<pair separator " << t.value[0] << ">";
 
         break;
       }
diff --git a/libbuild2/token.hxx b/libbuild2/token.hxx
index fca888c..f9ede65 100644
--- a/libbuild2/token.hxx
+++ b/libbuild2/token.hxx
@@ -30,6 +30,7 @@ namespace build2
       eos,
       newline,
       word,
+      escape,          // token::value is <...> in $\<...>
       pair_separator,  // token::value[0] is the pair separator char.
 
       colon,           // :
@@ -159,16 +160,13 @@ namespace build2
     token (string v, bool s,
            quote_type qt, bool qc, bool qf,
            uint64_t l, uint64_t c)
-        : token (token_type::word, move (v), s,
-                 qt, qc, qf,
-                 l, c,
-                 &token_printer) {}
+        : token (token_type::word, move (v), s, qt, qc, qf, l, c) {}
 
     token (token_type t,
            string v, bool s,
            quote_type qt, bool qc, bool qf,
            uint64_t l, uint64_t c,
-           printer_type* p)
+           printer_type* p = &token_printer)
         : type (t), separated (s),
           qtype (qt), qcomp (qc), qfirst (qf),
           value (move (v)),
author	Boris Kolpackov <boris@codesynthesis.com>	2022-12-15 11:24:18 +0200
committer	Boris Kolpackov <boris@codesynthesis.com>	2022-12-15 11:24:18 +0200
commit	3ca670b7b7c71ca67d70cac9dffb2ba6120b2e36 (patch)
tree	1424ac78fe10f697c8a0b63d91bb49889e8cdc85 /libbuild2
parent	0aa7a94e1032a96a2a72cb6a82824f9fe970d412 (diff)