From b408d19f614b47670cd0a0def501266f0d7689b5 Mon Sep 17 00:00:00 2001 From: Karen Arutyunov Date: Tue, 18 Oct 2022 15:13:29 +0300 Subject: Fix unexpected 'unterminated double-quoted sequence' script error --- libbuild2/build/script/lexer.cxx | 17 +++--- libbuild2/build/script/lexer.hxx | 2 + libbuild2/build/script/parser+diag.test.testscript | 13 +++-- libbuild2/build/script/parser+for.test.testscript | 12 ++-- libbuild2/lexer.cxx | 67 +++++++++++++++++++--- libbuild2/lexer.hxx | 46 ++++++++++++--- libbuild2/script/lexer.cxx | 11 ++-- libbuild2/script/lexer.hxx | 2 + libbuild2/test/script/lexer.cxx | 19 +++--- libbuild2/test/script/lexer.hxx | 2 + libbuild2/test/script/parser+for.test.testscript | 15 +++++ 11 files changed, 152 insertions(+), 54 deletions(-) (limited to 'libbuild2') diff --git a/libbuild2/build/script/lexer.cxx b/libbuild2/build/script/lexer.cxx index 5c13239..e0d87fe 100644 --- a/libbuild2/build/script/lexer.cxx +++ b/libbuild2/build/script/lexer.cxx @@ -35,10 +35,7 @@ namespace build2 bool q (true); // quotes if (!esc) - { - assert (!state_.empty ()); - esc = state_.top ().escapes; - } + esc = current_state ().escapes; switch (m) { @@ -107,7 +104,7 @@ namespace build2 } assert (ps == '\0'); - state_.push ( + mode_impl ( state {m, data, nullopt, false, false, ps, s, n, q, *esc, s1, s2}); } @@ -116,7 +113,7 @@ namespace build2 { token r; - switch (state_.top ().mode) + switch (mode ()) { case lexer_mode::command_line: case lexer_mode::first_token: @@ -142,7 +139,7 @@ namespace build2 xchar c (get ()); uint64_t ln (c.line), cn (c.column); - state st (state_.top ()); // Make copy (see first/second_token). + state st (current_state ()); // Make copy (see first/second_token). lexer_mode m (st.mode); auto make_token = [&sep, ln, cn] (type t) @@ -158,7 +155,7 @@ namespace build2 assert (m == lexer_mode::variable_line || m == lexer_mode::for_loop); - state_.top ().lsbrace = false; // Note: st is a copy. + current_state ().lsbrace = false; // Note: st is a copy. if (c == '[' && (!st.lsbrace_unsep || !sep)) return make_token (type::lsbrace); @@ -171,7 +168,7 @@ namespace build2 // we push any new mode (e.g., double quote). // if (m == lexer_mode::first_token || m == lexer_mode::second_token) - state_.pop (); + expire_mode (); // NOTE: remember to update mode() if adding new special characters. @@ -182,7 +179,7 @@ namespace build2 // Expire variable value mode at the end of the line. // if (m == lexer_mode::variable_line) - state_.pop (); + expire_mode (); sep = true; // Treat newline as always separated. return make_token (type::newline); diff --git a/libbuild2/build/script/lexer.hxx b/libbuild2/build/script/lexer.hxx index 313d80a..3f51493 100644 --- a/libbuild2/build/script/lexer.hxx +++ b/libbuild2/build/script/lexer.hxx @@ -68,6 +68,8 @@ namespace build2 static redirect_aliases_type redirect_aliases; private: + using build2::script::lexer::mode; // Getter. + token next_line (); }; diff --git a/libbuild2/build/script/parser+diag.test.testscript b/libbuild2/build/script/parser+diag.test.testscript index 272d10c..a720fe2 100644 --- a/libbuild2/build/script/parser+diag.test.testscript +++ b/libbuild2/build/script/parser+diag.test.testscript @@ -45,16 +45,19 @@ $* <>EOO diag: copy foo EOO - # @@ TMP Enable when the unexpected 'unterminated double-quoted sequence' - # error is fixed. - #\ : quoted : - $* <false + $* <'diag: foo' f = foo diag "$f" EOI - #\ + + : quoted-eval + : + $* <'diag: foo' + f = foo + diag "($f)" + EOI : temp_dir : diff --git a/libbuild2/build/script/parser+for.test.testscript b/libbuild2/build/script/parser+for.test.testscript index 880b03c..2a9f169 100644 --- a/libbuild2/build/script/parser+for.test.testscript +++ b/libbuild2/build/script/parser+for.test.testscript @@ -543,20 +543,20 @@ buildfile:11:1: error: for: missing variable name EOE - # @@ TMP Enable when the unexpected 'unterminated double-quoted sequence' - # error is fixed. - #\ - : quoted-ops + : quoted-opt : $* <>EOO - o=-w + o = -w for "$o" x <'a b' cmd $x end + for "($o)" x <'a b' + cmd $x + end EOI for -w x <'a b' + for -w x <'a b' EOO - #\ : untyped : diff --git a/libbuild2/lexer.cxx b/libbuild2/lexer.cxx index 76c31be..9176422 100644 --- a/libbuild2/lexer.cxx +++ b/libbuild2/lexer.cxx @@ -160,13 +160,15 @@ namespace build2 break; } case lexer_mode::foreign: - assert (data > 1); - // Fall through. + { + assert (ps == '\0' && data > 1); + s = false; + break; + } case lexer_mode::single_quoted: case lexer_mode::double_quoted: { - assert (ps == '\0'); - s = false; + assert (false); // Can only be set manually in word(). break; } case lexer_mode::variable: @@ -178,8 +180,49 @@ namespace build2 default: assert (false); // Unhandled custom mode. } - state_.push ( - state {m, data, nullopt, lsb, false, ps, s, n, q, *esc, s1, s2}); + mode_impl (state {m, data, nullopt, lsb, false, ps, s, n, q, *esc, s1, s2}); + } + + void lexer:: + mode_impl (state&& s) + { + // If we are in the double-quoted mode then, unless the new mode is eval + // or variable, delay the state switch until the current mode is expired. + // Note that we delay by injecting the new state beneath the current + // state. + // + if (!state_.empty () && + state_.top ().mode == lexer_mode::double_quoted && + s.mode != lexer_mode::eval && + s.mode != lexer_mode::variable) + { + state qs (move (state_.top ())); // Save quoted state. + state_.top () = move (s); // Overwrite quoted state with new state. + state_.push (move (qs)); // Restore quoted state. + } + else + state_.push (move (s)); + } + + void lexer:: + expire_mode () + { + // If we are in the double-quoted mode, then delay the state expiration + // until the current mode is expired. Note that we delay by overwriting + // the being expired state with the current state. + // + assert (!state_.empty () && + (state_.top ().mode != lexer_mode::double_quoted || + state_.size () > 1)); + + if (state_.top ().mode == lexer_mode::double_quoted) + { + state qs (move (state_.top ())); // Save quoted state. + state_.pop (); // Pop quoted state. + state_.top () = move (qs); // Expire state, restoring quoted state. + } + else + state_.pop (); } token lexer:: @@ -835,6 +878,13 @@ namespace build2 // if (st.quotes && !done) { + auto quoted_mode = [this] (lexer_mode m) + { + state_.push (state { + m, 0, nullopt, false, false, '\0', false, true, true, + state_.top ().escapes, nullptr, nullptr}); + }; + switch (c) { case '\'': @@ -842,7 +892,7 @@ namespace build2 // Enter the single-quoted mode in case the derived lexer needs // to notice this. // - mode (lexer_mode::single_quoted); + quoted_mode (lexer_mode::single_quoted); switch (qtype) { @@ -881,7 +931,8 @@ namespace build2 { get (); - mode (lexer_mode::double_quoted); + quoted_mode (lexer_mode::double_quoted); + st = state_.top (); m = st.mode; diff --git a/libbuild2/lexer.hxx b/libbuild2/lexer.hxx index 78d35d7..4371206 100644 --- a/libbuild2/lexer.hxx +++ b/libbuild2/lexer.hxx @@ -133,10 +133,23 @@ namespace build2 const path_name& name () const {return name_;} - // Note: sets mode for the next token. The second argument can be used to - // specify the pair separator character (if the mode supports pairs). If - // escapes is not specified, then inherit the current mode's (though a - // mode can also override it). + // Set the lexer mode for the next token or delay this until the end of a + // double-quoted token sequence is encountered. The second argument can be + // used to specify the pair separator character (if the mode supports + // pairs). If escapes is not specified, then inherit the current mode's + // (though a mode can also override it). + // + // Note that there is a common parsing pattern of sensing the language + // construct kind we are about to parse by reading its first token, + // switching to an appropriate lexing mode, and then parsing the rest. The + // problem here is that the first token may start the double-quoted token + // sequence, turning the lexer into the double-quoted mode. In this case + // switching the lexer mode right away would not be a good idea. Thus, + // this function delays the mode switch until the end of the double-quoted + // sequence is encountered. Note, however, that such a delay only works + // properly if the function is called right after the first quoted token + // is read (because any subsequent tokens may end up being parsed in a + // nested mode such as variable or eval; see mode_impl() for details). // virtual void mode (lexer_mode, @@ -153,10 +166,12 @@ namespace build2 state_.top ().lsbrace_unsep = unsep; } - // Expire the current mode early. + // Expire the current mode early or delay this until the end of a + // double-quoted token sequence is encountered (see mode() for details on + // the delay condition and reasoning). // void - expire_mode () {state_.pop ();} + expire_mode (); lexer_mode mode () const {return state_.top ().mode;} @@ -258,6 +273,20 @@ namespace build2 pair skip_spaces (); + // Set state for the next token or delay until the end of a double-quoted + // token sequence is encountered (see mode() for details on the delay + // condition and reasoning). + // + void + mode_impl (state&&); + + state& + current_state () + { + assert (!state_.empty ()); + return state_.top (); + } + // Diagnostics. // protected: @@ -286,11 +315,14 @@ namespace build2 } const path_name& name_; - std::stack state_; bool sep_; // True if we skipped spaces in peek(). private: + // Use current_state(), mode_impl(), and expire_mode(). + // + std::stack state_; + using base = char_scanner; // Buffer for a get()/peek() potential error. diff --git a/libbuild2/script/lexer.cxx b/libbuild2/script/lexer.cxx index 7577149..e13bbdb 100644 --- a/libbuild2/script/lexer.cxx +++ b/libbuild2/script/lexer.cxx @@ -24,10 +24,7 @@ namespace build2 bool q (true); // quotes if (!esc) - { - assert (!state_.empty ()); - esc = state_.top ().escapes; - } + esc = current_state ().escapes; switch (m) { @@ -84,7 +81,7 @@ namespace build2 } assert (ps == '\0'); - state_.push ( + mode_impl ( state {m, data, nullopt, false, false, ps, s, n, q, *esc, s1, s2}); } @@ -93,7 +90,7 @@ namespace build2 { token r; - switch (state_.top ().mode) + switch (mode ()) { case lexer_mode::command_expansion: case lexer_mode::here_line_single: @@ -119,7 +116,7 @@ namespace build2 xchar c (get ()); uint64_t ln (c.line), cn (c.column); - const state& st (state_.top ()); + const state& st (current_state ()); lexer_mode m (st.mode); auto make_token = [&sep, &m, ln, cn] (type t) diff --git a/libbuild2/script/lexer.hxx b/libbuild2/script/lexer.hxx index dbfdfcc..3cbcc03 100644 --- a/libbuild2/script/lexer.hxx +++ b/libbuild2/script/lexer.hxx @@ -112,6 +112,8 @@ namespace build2 const redirect_aliases_type& redirect_aliases; protected: + using build2::lexer::mode; // Getter. + lexer (istream& is, const path_name& name, uint64_t line, const char* escapes, bool set_mode, diff --git a/libbuild2/test/script/lexer.cxx b/libbuild2/test/script/lexer.cxx index 9475ad4..b470d25 100644 --- a/libbuild2/test/script/lexer.cxx +++ b/libbuild2/test/script/lexer.cxx @@ -34,10 +34,7 @@ namespace build2 bool q (true); // quotes if (!esc) - { - assert (!state_.empty ()); - esc = state_.top ().escapes; - } + esc = current_state ().escapes; switch (m) { @@ -113,7 +110,7 @@ namespace build2 } assert (ps == '\0'); - state_.push ( + mode_impl ( state {m, data, nullopt, false, false, ps, s, n, q, *esc, s1, s2}); } @@ -122,7 +119,7 @@ namespace build2 { token r; - switch (state_.top ().mode) + switch (mode ()) { case lexer_mode::command_line: case lexer_mode::first_token: @@ -151,7 +148,7 @@ namespace build2 xchar c (get ()); uint64_t ln (c.line), cn (c.column); - state st (state_.top ()); // Make copy (see first/second_token). + state st (current_state ()); // Make copy (see first/second_token). lexer_mode m (st.mode); auto make_token = [&sep, ln, cn] (type t) @@ -167,7 +164,7 @@ namespace build2 assert (m == lexer_mode::variable_line || m == lexer_mode::for_loop); - state_.top ().lsbrace = false; // Note: st is a copy. + current_state ().lsbrace = false; // Note: st is a copy. if (c == '[' && (!st.lsbrace_unsep || !sep)) return make_token (type::lsbrace); @@ -180,7 +177,7 @@ namespace build2 // we push any new mode (e.g., double quote). // if (m == lexer_mode::first_token || m == lexer_mode::second_token) - state_.pop (); + expire_mode (); // NOTE: remember to update mode() if adding new special characters. @@ -191,7 +188,7 @@ namespace build2 // Expire variable value mode at the end of the line. // if (m == lexer_mode::variable_line) - state_.pop (); + expire_mode (); sep = true; // Treat newline as always separated. return make_token (type::newline); @@ -322,7 +319,7 @@ namespace build2 if (c == '\n') { get (); - state_.pop (); // Expire the description mode. + expire_mode (); // Expire the description mode. return token (type::newline, true, ln, cn, token_printer); } diff --git a/libbuild2/test/script/lexer.hxx b/libbuild2/test/script/lexer.hxx index def269b..993a9db 100644 --- a/libbuild2/test/script/lexer.hxx +++ b/libbuild2/test/script/lexer.hxx @@ -68,6 +68,8 @@ namespace build2 static redirect_aliases_type redirect_aliases; private: + using build2::script::lexer::mode; // Getter. + token next_line (); diff --git a/libbuild2/test/script/parser+for.test.testscript b/libbuild2/test/script/parser+for.test.testscript index 9d70886..5350f28 100644 --- a/libbuild2/test/script/parser+for.test.testscript +++ b/libbuild2/test/script/parser+for.test.testscript @@ -801,6 +801,21 @@ testscript:1:1: error: for: missing variable name EOE + : quoted-opt + : + $* <>EOO + o = -w + for "$o" x <'a b' + cmd $x + end; + for "($o)" x <'a b' + cmd $x + end + EOI + for -w x <'a b' + for -w x <'a b' + EOO + : untyped : $* <>EOO -- cgit v1.1