aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBoris Kolpackov <boris@codesynthesis.com>2020-05-06 06:58:34 +0200
committerBoris Kolpackov <boris@codesynthesis.com>2020-05-27 08:35:29 +0200
commitfce9782a330e8f701a8df0b5200e5b78e97ec4b5 (patch)
tree725897267dd6b78761dad46b2cae695f27889bf7
parentfa717b42574fe1a7c0c07393f6790595d39c33ae (diff)
Handle multi-curly-brace tokens in lexer
-rw-r--r--libbuild2/lexer+foreign.test.testscript96
-rw-r--r--libbuild2/lexer+normal.test.testscript18
-rw-r--r--libbuild2/lexer.cxx124
-rw-r--r--libbuild2/lexer.hxx25
-rw-r--r--libbuild2/lexer.test.cxx14
-rw-r--r--libbuild2/test/script/lexer.cxx2
-rw-r--r--libbuild2/token.cxx69
-rw-r--r--libbuild2/token.hxx3
8 files changed, 306 insertions, 45 deletions
diff --git a/libbuild2/lexer+foreign.test.testscript b/libbuild2/lexer+foreign.test.testscript
new file mode 100644
index 0000000..94c83c1
--- /dev/null
+++ b/libbuild2/lexer+foreign.test.testscript
@@ -0,0 +1,96 @@
+# file : libbuild2/lexer+foreign.test.testscript
+# license : MIT; see accompanying LICENSE file
+
+test.arguments = foreign=2
+
+: basics
+:
+$* <<EOI >>EOO
+echo foo
+}}
+EOI
+'echo foo
+'
+}}
+<newline>
+EOO
+
+: empty
+:
+$* <<EOI >>EOO
+}}
+EOI
+''
+}}
+<newline>
+EOO
+
+: braces
+:
+$* <<EOI >>EOO
+}
+}}}
+{{}}
+}} }
+}}
+EOI
+'}
+}}}
+{{}}
+}} }
+'
+}}
+<newline>
+EOO
+
+: whitespaces
+:
+$* <' }} ' >>EOO # Note: there are TABs.
+''
+}}
+<newline>
+EOO
+
+: comment
+:
+$* <'}} # comment' >>EOO
+''
+}}
+<newline>
+EOO
+
+: eos
+:
+$* <:'}}' >>EOO
+''
+}}
+EOO
+
+: missing
+: Note that we get eos right away (i.e., there is no word token).
+:
+$* <<EOI
+}
+}}}
+{{}}
+}} }
+}
+EOI
+
+: three
+:
+{
+ test.arguments = foreign=3
+
+ : basic
+ :
+ $* <<EOI >>EOO
+ echo foo
+ }}}
+ EOI
+ ' echo foo
+ '
+ }}}
+ <newline>
+ EOO
+}
diff --git a/libbuild2/lexer+normal.test.testscript b/libbuild2/lexer+normal.test.testscript
index e66b81e..e2780a2 100644
--- a/libbuild2/lexer+normal.test.testscript
+++ b/libbuild2/lexer+normal.test.testscript
@@ -70,3 +70,21 @@ EOO
'x%'
EOO
}
+
+: multi-lcbrace
+: Leading multi-curly-brace recognition.
+:
+{
+ : two
+ :
+ $* <:'{{' >>EOO
+ {{
+ EOO
+
+ : three
+ :
+ $* <:'{{{c++' >>EOO
+ {{{
+ 'c++'
+ EOO
+}
diff --git a/libbuild2/lexer.cxx b/libbuild2/lexer.cxx
index e970437..6d3504c 100644
--- a/libbuild2/lexer.cxx
+++ b/libbuild2/lexer.cxx
@@ -128,10 +128,16 @@ namespace build2
n = false;
break;
}
+ case lexer_mode::foreign:
+ assert (data > 1);
+ // Fall through.
case lexer_mode::single_quoted:
case lexer_mode::double_quoted:
- s = false;
- // Fall through.
+ {
+ assert (ps == '\0');
+ s = false;
+ break;
+ }
case lexer_mode::variable:
{
// These are handled in an ad hoc way in word().
@@ -141,7 +147,7 @@ namespace build2
default: assert (false); // Unhandled custom mode.
}
- state_.push (state {m, data, a, ps, s, n, q, *esc, s1, s2});
+ state_.push (state {m, data, nullopt, a, ps, s, n, q, *esc, s1, s2});
}
token lexer::
@@ -166,6 +172,7 @@ namespace build2
case lexer_mode::buildspec: break;
case lexer_mode::eval: return next_eval ();
case lexer_mode::double_quoted: return next_quoted ();
+ case lexer_mode::foreign: return next_foreign ();
default: assert (false); // Unhandled custom mode.
}
@@ -241,11 +248,29 @@ namespace build2
}
}
+ // Line-leading tokens in the normal mode.
+ //
+ // Note: must come before any other (e.g., `{`) tests below.
+ //
if (m == lexer_mode::normal && first)
{
switch (c)
{
case '%': return make_token (type::percent);
+ case '{':
+ {
+ string v;
+ while (peek () == '{')
+ v += get ();
+
+ if (!v.empty ())
+ {
+ v += '{';
+ return make_token (type::multi_lcbrace, move (v));
+ }
+
+ break;
+ }
}
}
@@ -507,6 +532,99 @@ namespace build2
}
token lexer::
+ next_foreign () // Lex foreign mode: contents word, then the closing `}}...`.
+ {
+ state& st (state_.top ()); // Current (foreign mode) state.
+
+ if (st.hold) // Word already returned; now return the held closing braces.
+ {
+ token r (move (*st.hold));
+ state_.pop (); // Expire foreign mode.
+ return r;
+ }
+
+ auto count (state_.top ().data); // Number of closing braces to expect.
+
+ xchar c (get ()); // First character of first line after `{{...`.
+ uint64_t ln (c.line), cn (c.column); // Position of the word token.
+
+ string lexeme;
+ for (bool first (true); !eos (c); c = get ())
+ {
+ // If this is the first character of a line, recognize closing braces.
+ //
+ if (first)
+ {
+ first = false;
+
+ // If this turns out not to be the closing braces, we need to add any
+ // characters we have extracted to lexeme. Instead of saving these
+ // characters in a temporary we speculatively add them to the lexeme
+ // but then chop them off if this turned out to be the closing braces.
+ //
+ size_t chop (lexeme.size ()); // Lexeme length before this line.
+
+ // Skip leading whitespaces, if any.
+ //
+ for (; c == ' ' || c == '\t'; c = get ())
+ lexeme += c;
+
+ uint64_t bln (c.line), bcn (c.column); // Position of first `}`.
+
+ // Count braces, stopping as soon as we have seen count of them.
+ //
+ auto i (count);
+ for (; c == '}'; c = get ())
+ {
+ lexeme += c;
+
+ if (--i == 0)
+ break;
+ }
+
+ if (i == 0) // Got enough braces.
+ {
+ // Make sure there are only whitespaces/comments after. Note that
+ // now we must start peeking since newline is not "ours".
+ //
+ for (c = peek (); c == ' ' || c == '\t'; c = peek ())
+ lexeme += get ();
+
+ if (c == '\n' || c == '#' || eos (c))
+ {
+ st.hold = token (type::multi_rcbrace,
+ string (count, '}'),
+ false, quote_type::unquoted, false,
+ bln, bcn,
+ token_printer);
+
+ lexeme.resize (chop); // Drop the closing braces line.
+ return token (move (lexeme),
+ false, quote_type::unquoted, false,
+ ln, cn);
+ }
+
+ get (); // Consume the peeked character (now in c) and fall through (not eos).
+ }
+ else
+ {
+ if (eos (c)) // Input ended on this line: return eos below.
+ break;
+
+ // Fall through.
+ }
+ }
+
+ if (c == '\n') // The next character starts a new line.
+ first = true;
+
+ lexeme += c;
+ }
+ // Reached eos without the closing braces: no word token is returned.
+ return token (type::eos, false, c.line, c.column, token_printer);
+ }
+
+ token lexer::
word (state st, bool sep)
{
lexer_mode m (st.mode);
diff --git a/libbuild2/lexer.hxx b/libbuild2/lexer.hxx
index 6dc5027..8dd58c8 100644
--- a/libbuild2/lexer.hxx
+++ b/libbuild2/lexer.hxx
@@ -43,13 +43,22 @@ namespace build2
// split words separated by the pair character (to disable pairs one can
// pass `\0` as a pair character).
//
- // The normal mode recognizes `%` at the beginning of the line as special.
- // The cmdvar mode is like normal but does not treat `%` as special.
+ // The normal mode recognizes `%` and `{{...` at the beginning of the line
+ // as special. The cmdvar mode is like normal but does not treat these
+ // character sequences as special.
+ //
+ // Finally, the foreign mode reads everything until encountering a line that
+ // contains nothing (besides whitespaces) other than the closing multi-
+ // curly-brace (`}}...`) (or eos) returning the contents as the word token
+ // followed by the multi_rcbrace (or eos). In a way it is similar to the
+ // single-quote mode. The number of closing braces to expect is passed as
+ // mode data.
//
// The alternative modes must be set manually. The value/values and derived
// modes automatically expire after the end of the line. The attribute mode
// expires after the closing `]`. The variable mode expires after the word
- // token. And the eval mode expires after the closing `)`.
+ // token. The eval mode expires after the closing `)`. And the foreign mode
+ // expires after the closing braces.
//
// Note that normally it is only safe to switch mode when the current token
// is not quoted (or, more generally, when you are not in the double-quoted
@@ -85,6 +94,7 @@ namespace build2
eval,
single_quoted,
double_quoted,
+ foreign,
buildspec,
value_next
@@ -163,8 +173,10 @@ namespace build2
protected:
struct state
{
- lexer_mode mode;
- uintptr_t data;
+ lexer_mode mode;
+ uintptr_t data;
+ optional<token> hold;
+
bool attributes;
char sep_pair;
@@ -190,6 +202,9 @@ namespace build2
token
next_quoted ();
+ token
+ next_foreign ();
+
// Lex a word assuming current is the top state (which may already have
// been "expired" from the top).
//
diff --git a/libbuild2/lexer.test.cxx b/libbuild2/lexer.test.cxx
index 5e39e43..3458f56 100644
--- a/libbuild2/lexer.test.cxx
+++ b/libbuild2/lexer.test.cxx
@@ -1,6 +1,7 @@
// file : libbuild2/lexer.test.cxx -*- C++ -*-
// license : MIT; see accompanying LICENSE file
+#include <cstdlib> // strtoul()
#include <cassert>
#include <iostream>
@@ -14,13 +15,15 @@ using namespace std;
namespace build2
{
- // Usage: argv[0] [-q] [<lexer-mode>]
+ // Usage: argv[0] [-q] [<lexer-mode>[=<data>]]
//
int
main (int argc, char* argv[])
{
bool quote (false);
+
lexer_mode m (lexer_mode::normal);
+ uintptr_t d (0);
for (int i (1); i != argc; ++i)
{
@@ -36,7 +39,12 @@ namespace build2
else if (a == "attributes") m = lexer_mode::attributes;
else if (a == "eval") m = lexer_mode::eval;
else if (a == "buildspec") m = lexer_mode::buildspec;
- else assert (false);
+ else if (a.compare (0, 8, "foreign=") == 0)
+ {
+ m = lexer_mode::foreign;
+ d = strtoul (a.c_str () + 8, nullptr, 10);
+ }
+ else assert (false);
break;
}
}
@@ -51,7 +59,7 @@ namespace build2
lexer l (cin, in);
if (m != lexer_mode::normal)
- l.mode (m);
+ l.mode (m, '\0', nullopt, d);
// No use printing eos since we will either get it or loop forever.
//
diff --git a/libbuild2/test/script/lexer.cxx b/libbuild2/test/script/lexer.cxx
index 1eeb0be..32c1cf4 100644
--- a/libbuild2/test/script/lexer.cxx
+++ b/libbuild2/test/script/lexer.cxx
@@ -138,7 +138,7 @@ namespace build2
}
assert (ps == '\0');
- state_.push (state {m, data, a, ps, s, n, q, *esc, s1, s2});
+ state_.push (state {m, data, nullopt, a, ps, s, n, q, *esc, s1, s2});
}
token lexer::
diff --git a/libbuild2/token.cxx b/libbuild2/token.cxx
index 11b080e..cfdc6bd 100644
--- a/libbuild2/token.cxx
+++ b/libbuild2/token.cxx
@@ -21,39 +21,42 @@ namespace build2
case token_type::pair_separator: os << "<pair separator " << t.value[0] << ">"; break;
case token_type::word: os << '\'' << t.value << '\''; break;
- case token_type::colon: os << q << ':' << q; break;
- case token_type::dollar: os << q << '$' << q; break;
- case token_type::question: os << q << '?' << q; break;
- case token_type::percent: os << q << '%' << q; break;
- case token_type::comma: os << q << ',' << q; break;
-
- case token_type::lparen: os << q << '(' << q; break;
- case token_type::rparen: os << q << ')' << q; break;
-
- case token_type::lcbrace: os << q << '{' << q; break;
- case token_type::rcbrace: os << q << '}' << q; break;
-
- case token_type::lsbrace: os << q << '[' << q; break;
- case token_type::rsbrace: os << q << ']' << q; break;
-
- case token_type::labrace: os << q << '<' << q; break;
- case token_type::rabrace: os << q << '>' << q; break;
-
- case token_type::assign: os << q << '=' << q; break;
- case token_type::prepend: os << q << "=+" << q; break;
- case token_type::append: os << q << "+=" << q; break;
- case token_type::default_assign: os << q << "?=" << q; break;
-
- case token_type::equal: os << q << "==" << q; break;
- case token_type::not_equal: os << q << "!=" << q; break;
- case token_type::less: os << q << '<' << q; break;
- case token_type::greater: os << q << '>' << q; break;
- case token_type::less_equal: os << q << "<=" << q; break;
- case token_type::greater_equal: os << q << ">=" << q; break;
-
- case token_type::log_or: os << q << "||" << q; break;
- case token_type::log_and: os << q << "&&" << q; break;
- case token_type::log_not: os << q << '!' << q; break;
+ case token_type::colon: os << q << ':' << q; break;
+ case token_type::dollar: os << q << '$' << q; break;
+ case token_type::question: os << q << '?' << q; break;
+ case token_type::percent: os << q << '%' << q; break;
+ case token_type::comma: os << q << ',' << q; break;
+
+ case token_type::lparen: os << q << '(' << q; break;
+ case token_type::rparen: os << q << ')' << q; break;
+
+ case token_type::lcbrace: os << q << '{' << q; break;
+ case token_type::rcbrace: os << q << '}' << q; break;
+
+ case token_type::multi_lcbrace: os << q << t.value << q; break;
+ case token_type::multi_rcbrace: os << q << t.value << q; break;
+
+ case token_type::lsbrace: os << q << '[' << q; break;
+ case token_type::rsbrace: os << q << ']' << q; break;
+
+ case token_type::labrace: os << q << '<' << q; break;
+ case token_type::rabrace: os << q << '>' << q; break;
+
+ case token_type::assign: os << q << '=' << q; break;
+ case token_type::prepend: os << q << "=+" << q; break;
+ case token_type::append: os << q << "+=" << q; break;
+ case token_type::default_assign: os << q << "?=" << q; break;
+
+ case token_type::equal: os << q << "==" << q; break;
+ case token_type::not_equal: os << q << "!=" << q; break;
+ case token_type::less: os << q << '<' << q; break;
+ case token_type::greater: os << q << '>' << q; break;
+ case token_type::less_equal: os << q << "<=" << q; break;
+ case token_type::greater_equal: os << q << ">=" << q; break;
+
+ case token_type::log_or: os << q << "||" << q; break;
+ case token_type::log_and: os << q << "&&" << q; break;
+ case token_type::log_not: os << q << '!' << q; break;
default: assert (false); // Unhandled extended token.
}
diff --git a/libbuild2/token.hxx b/libbuild2/token.hxx
index 8dad4ba..e11b880 100644
--- a/libbuild2/token.hxx
+++ b/libbuild2/token.hxx
@@ -45,6 +45,9 @@ namespace build2
lcbrace, // {
rcbrace, // }
+ multi_lcbrace, // {{... (value contains the braces)
+ multi_rcbrace, // }}... (value contains the braces)
+
lsbrace, // [
rsbrace, // ]