diff options
Diffstat (limited to 'libbutl/manifest-parser.cxx')
-rw-r--r-- | libbutl/manifest-parser.cxx | 238 |
1 files changed, 175 insertions, 63 deletions
diff --git a/libbutl/manifest-parser.cxx b/libbutl/manifest-parser.cxx index 9514bbd..904910a 100644 --- a/libbutl/manifest-parser.cxx +++ b/libbutl/manifest-parser.cxx @@ -1,39 +1,10 @@ // file : libbutl/manifest-parser.cxx -*- C++ -*- // license : MIT; see accompanying LICENSE file -#ifndef __cpp_modules_ts -#include <libbutl/manifest-parser.mxx> -#endif +#include <libbutl/manifest-parser.hxx> -#include <cassert> - -#ifndef __cpp_lib_modules_ts #include <string> -#include <vector> -#include <cstdint> -#include <utility> -#include <stdexcept> - -#include <sstream> -#endif - -// Other includes. - -#ifdef __cpp_modules_ts -module butl.manifest_parser; - -// Only imports additional to interface. -#ifdef __clang__ -#ifdef __cpp_lib_modules_ts -import std.core; -import std.io; -#endif -import butl.optional; -import butl.char_scanner; -import butl.manifest_types; -#endif - -#endif +#include <cassert> using namespace std; @@ -177,41 +148,136 @@ namespace butl { using iterator = string::const_iterator; - auto space = [] (char c) -> bool {return c == ' ' || c == '\t';}; + // Parse the value differently depending on whether it is multi-line or + // not. + // + if (v.find ('\n') == string::npos) // Single-line. + { + auto space = [] (char c) {return c == ' ' || c == '\t';}; - iterator i (v.begin ()); - iterator e (v.end ()); + iterator i (v.begin ()); + iterator e (v.end ()); - string r; - size_t n (0); - for (char c; i != e && (c = *i) != ';'; ++i) - { - // Unescape ';' character. + string r; + size_t n (0); + for (char c; i != e && (c = *i) != ';'; ++i) + { + // Unescape ';' and '\' characters. + // + if (c == '\\' && i + 1 != e && (*(i + 1) == ';' || *(i + 1) == '\\')) + c = *++i; + + r += c; + + if (!space (c)) + n = r.size (); + } + + // Strip the value trailing spaces. // - if (c == '\\' && i + 1 != e && *(i + 1) == ';') - c = *++i; + if (r.size () != n) + r.resize (n); - r += c; + // Find beginning of a comment (i). + // + if (i != e) + { + // Skip spaces. + // + for (++i; i != e && space (*i); ++i); + } - if (!space (c)) - n = r.size (); + return make_pair (move (r), string (i, e)); } + else // Multi-line. + { + string r; + string c; - // Strip the value trailing spaces. - // - if (r.size () != n) - r.resize (n); + // Parse the value lines until the comment separator is encountered or + // the end of the value is reached. Add these lines to the resulting + // value, unescaping them if required. + // + // Note that we only need to unescape lines which have the '\+;' form. + // + auto i (v.begin ()); + auto e (v.end ()); - // Find beginning of a comment (i). - // - if (i != e) - { - // Skip spaces. + while (i != e) + { + // Find the end of the line and while at it the first non-backslash + // character. + // + auto le (i); + auto nb (e); + for (; le != e && *le != '\n'; ++le) + { + if (nb == e && *le != '\\') + nb = le; + } + + // If the value end is not reached then position to the beginning of + // the next line and to the end of the value otherwise. + // + auto next = [&i, &le, &e] () {i = (le != e ? le + 1 : e);}; + + // If the first non-backslash character is ';' and it is the last + // character on the line, then this is either the comment separator or + // an escape sequence. + // + if (nb != e && *nb == ';' && nb + 1 == le) + { + // If ';' is the first (and thus the only) character on the line, + // then this is the comment separator and we bail out from this + // loop. Note that in this case we need to trim the trailing newline + // (but only one) from the resulting value since it is considered as + // a part of the separator. + // + if (nb == i) + { + if (!r.empty ()) + { + assert (r.back () == '\n'); + r.pop_back (); + } + + next (); + break; + } + // + // Otherwise, this is an escape sequence, so unescape it. For that + // just take the rightmost half of the string: + // + // \; -> ; + // \\; -> \; + // \\\; -> \; + // \\\\; -> \\; + // \\\\\; -> \\; + // + else + i += (le - i) / 2; + } + + // Add the line to the resulting value together with the trailing + // newline, if present. + // + r.append (i, le); + + if (le != e) + r += '\n'; + + next (); + } + + // If we haven't reached the end of the value then it means we've + // encountered the comment separator. In this case save the remaining + // value part as a comment. // - for (++i; i != e && space (*i); ++i); - } + if (i != e) + c = string (i, e); - return make_pair (move (r), string (i, e)); + return make_pair (move (r), move (c)); + } } void manifest_parser:: @@ -251,7 +317,8 @@ namespace butl string& v (r.value); string::size_type n (0); // Size of last non-space character (simple mode). - // Detect the multi-line mode introductor. + // Detect the old-fashioned multi-line mode introducer (like in + // 'foo:\<newline>'). // bool ml (false); if (c == '\\') @@ -266,11 +333,46 @@ namespace butl ml = true; } else if (eos (p)) + { + c = p; // Set to EOF. ml = true; + } else unget (c); } + // Detect the new-fashioned multi-line mode introducer (like in + // 'foo:<newline>\<newline>'). + // + if (!ml && c == '\n') + { + get (); + xchar p1 (peek ()); + + if (p1 == '\\') + { + get (); + xchar p2 (peek ()); + + if (p2 == '\n') + { + get (); // Newline is not part of the value so skip it. + c = peek (); + ml = true; + } + else if (eos (p2)) + { + c = p2; // Set to EOF. + ml = true; + } + else + unget (p1); // Unget '\\'. Note: '\n' will be ungot below. + } + + if (!ml) + unget (c); // Unget '\n'. + } + // Multi-line value starts from the line that follows the name. // if (ml) @@ -281,7 +383,7 @@ namespace butl // The nl flag signals that the preceding character was a "special // newline", that is, a newline that was part of the milti-line mode - // introductor or an escape sequence. + // introducer or an escape sequence. // for (bool nl (ml); !eos (c); c = peek ()) { @@ -299,7 +401,7 @@ namespace butl // // The first block handles the special sequence that starts with // a special newline. In multi-line mode, this is an "immediate - // termination" where we "use" the newline from the introductor. + // termination" where we "use" the newline from the introducer. // Note also that in the simple mode the special sequence can // only start with a special (i.e., escaped) newline. // @@ -472,11 +574,21 @@ namespace butl static inline string format (const string& n, uint64_t l, uint64_t c, const string& d) { - ostringstream os; + using std::to_string; + + string r; if (!n.empty ()) - os << n << ':'; - os << l << ':' << c << ": error: " << d; - return os.str (); + { + r += n; + r += ':'; + } + + r += to_string (l); + r += ':'; + r += to_string (c); + r += ": error: "; + r += d; + return r; } manifest_parsing:: |