From 72648921ec28903615698a61aeff4799e1ca9a7d Mon Sep 17 00:00:00 2001 From: Boris Kolpackov Date: Tue, 9 Jun 2015 19:50:58 +0200 Subject: Implement low-level manifest parser --- bpkg/buildfile | 5 + bpkg/manifest-parser | 141 ++++++++++++ bpkg/manifest-parser.cxx | 453 +++++++++++++++++++++++++++++++++++++++ build/.gitignore | 1 + build/bootstrap.build | 6 + build/root.build | 8 + buildfile | 7 + tests/.gitignore | 1 + tests/buildfile | 7 + tests/manifest-parser/buildfile | 7 + tests/manifest-parser/driver.cxx | 210 ++++++++++++++++++ 11 files changed, 846 insertions(+) create mode 100644 bpkg/buildfile create mode 100644 bpkg/manifest-parser create mode 100644 bpkg/manifest-parser.cxx create mode 100644 build/.gitignore create mode 100644 build/bootstrap.build create mode 100644 build/root.build create mode 100644 buildfile create mode 100644 tests/.gitignore create mode 100644 tests/buildfile create mode 100644 tests/manifest-parser/buildfile create mode 100644 tests/manifest-parser/driver.cxx diff --git a/bpkg/buildfile b/bpkg/buildfile new file mode 100644 index 0000000..b7c1c9f --- /dev/null +++ b/bpkg/buildfile @@ -0,0 +1,5 @@ +# file : bpkg/buildfile +# copyright : Copyright (c) 2014-2015 Code Synthesis Ltd +# license : MIT; see accompanying LICENSE file + +lib{bpkg}: cxx{manifest-parser} diff --git a/bpkg/manifest-parser b/bpkg/manifest-parser new file mode 100644 index 0000000..2b1e4a5 --- /dev/null +++ b/bpkg/manifest-parser @@ -0,0 +1,141 @@ +// file : bpkg/manifest-parser -*- C++ -*- +// copyright : Copyright (c) 2014-2015 Code Synthesis Ltd +// license : MIT; see accompanying LICENSE file + +#ifndef BPKG_MANIFEST_PARSER +#define BPKG_MANIFEST_PARSER + +#include +#include +#include // uint64_t +#include // runtime_error + +namespace bpkg +{ + class manifest_parsing: public std::runtime_error + { + public: + manifest_parsing (const std::string& name, + std::uint64_t line, + std::uint64_t column, + const std::string& description); + + std::string name; + std::uint64_t line; + std::uint64_t column; + std::string description; + }; + + class manifest_parser + { + public: + manifest_parser (std::istream& is, const std::string& name) + : is_ (is), name_ (name) {} + + const std::string& + name () const {return name_;} + + struct name_value_type + { + std::string name; + std::string value; + + std::uint64_t name_line; + std::uint64_t name_column; + + std::uint64_t value_line; + std::uint64_t value_column; + }; + + // The first returned pair is special "start-of-manifest" with + // empty name and value being the format version: {"", ""}. + // After that we have a sequence of ordinary pairs which are + // the manifest. At the end of the manifest we have the special + // "end-of-manifest" pair with empty name and value: {"", ""}. + // After that we can either get another start-of-manifest pair + // (in which case the whole sequence repeats from the beginning) + // or we get another end-of-manifest pair which signals the end + // of stream (aka EOF). To put it another way, the parse sequence + // always has the following form: + // + // ({"", ""} {"", ""}* {"", ""})* {"", ""} + // + name_value_type + next (); + + private: + class xchar + { + public: + typedef std::char_traits traits_type; + typedef traits_type::int_type int_type; + typedef traits_type::char_type char_type; + + xchar (int_type v, std::uint64_t l, std::uint64_t c) + : v_ (v), l_ (l), c_ (c) {} + + operator char_type () const {return static_cast (v_);} + + int_type + value () const {return v_;} + + std::uint64_t line () const {return l_;} + std::uint64_t column () const {return c_;} + + private: + int_type v_; + std::uint64_t l_; + std::uint64_t c_; + }; + + private: + void + parse_name (name_value_type&); + + void + parse_value (name_value_type&); + + // Skip spaces and return the first peeked non-space character. + // + xchar + skip_spaces (); + + // Character interface. + // + private: + xchar + peek (); + + xchar + get (); + + void + unget (const xchar&); + + // Tests. + // + bool + is_eos (const xchar& c) const + { + return c.value () == xchar::traits_type::eof (); + } + + private: + enum {start, body, eos} s_ = start; + std::string version_; // Current format version. + + private: + std::istream& is_; + const std::string name_; + + std::uint64_t l_ {1}; + std::uint64_t c_ {1}; + + bool unget_ {false}; + xchar buf_ {0, 0, 0}; + + bool eos_ {false}; + }; +} + +#endif // BPKG_MANIFEST_PARSER diff --git a/bpkg/manifest-parser.cxx b/bpkg/manifest-parser.cxx new file mode 100644 index 0000000..374de3e --- /dev/null +++ b/bpkg/manifest-parser.cxx @@ -0,0 +1,453 @@ +// file : bpkg/manifest-parser.cxx -*- C++ -*- +// copyright : Copyright (c) 2014-2015 Code Synthesis Ltd +// license : MIT; see accompanying LICENSE file + +#include + +#include +#include + +using namespace std; + +namespace bpkg +{ + using name_value = manifest_parser::name_value_type; + + name_value manifest_parser:: + next () + { + if (s_ == eos) + return name_value {"", "", l_, c_, l_, c_}; + + xchar c (skip_spaces ()); + + // Here is the problem: if we are in the 'body' state (that is, + // we are parsing inside the manifest) and we see the special + // empty name, then before returning the "start" pair for the + // next manifest, we have to return the "end" pair. One way + // would be to cache the "start" pair and return it on the + // next call of next(). But that would require quite a bit + // of extra logic. The alternative is to detect the beginning + // of the empty name before parsing too far. This way, the + // next call to next() will start parsing where we left of + // and return the "start" pair naturally. + // + if (s_ == body && c == ':') + { + s_ = start; + uint64_t ln (c.line ()), cn (c.column ()); + return name_value {"", "", ln, cn, ln, cn}; + } + + // Regardless of the state, what should come next is a name, + // potentially the special empty one. + // + name_value r; + parse_name (r); + + skip_spaces (); + c = get (); + + if (is_eos (c)) + { + // This is ok as long as the name is empty. + // + if (!r.name.empty ()) + throw manifest_parsing (name_, c.line (), c.column (), + "':' expected after name"); + + s_ = eos; + + // The "end" pair. + // + r.value_line = r.name_line; + r.value_column = r.name_column; + return r; + } + + if (c != ':') + throw manifest_parsing (name_, c.line (), c.column (), + "':' expected after name"); + + skip_spaces (); + parse_value (r); + + c = peek (); + + // The character after the value should be either a newline or eos. + // + assert (c == '\n' || is_eos (c)); + + if (c == '\n') + get (); + + // Now figure out whether what we've got makes sense, depending + // on the state we are in. + // + if (s_ == start) + { + // Start of the (next) manifest. The first pair should be the + // special empty name/format version. + // + if (!r.name.empty ()) + throw manifest_parsing (name_, r.name_line, r.name_column, + "format version pair expected"); + + // The version value is only mandatory for the first manifest in + // a sequence. + // + if (r.value.empty ()) + { + if (version_.empty ()) + throw manifest_parsing (name_, r.value_line, r.value_column, + "format version value expected"); + r.value = version_; + } + else + { + version_ = r.value; // Update with the latest. + + if (version_ != "1") + throw manifest_parsing (name_, r.value_line, r.value_column, + "unsupported format version " + version_); + } + + s_ = body; + } + else + { + // Parsing the body of the manifest. + // + + // Should have been handled by the special case above. + // + assert (!r.name.empty ()); + } + + return r; + } + + void manifest_parser:: + parse_name (name_value& r) + { + xchar c (peek ()); + + r.name_line = c.line (); + r.name_column = c.column (); + + for (; !is_eos (c); c = peek ()) + { + if (c == ':' || c == ' ' || c == '\t' || c == '\n') + break; + + r.name += c; + get (); + } + } + + void manifest_parser:: + parse_value (name_value& r) + { + xchar c (peek ()); + + r.value_line = c.line (); + r.value_column = c.column (); + + string& v (r.value); + string::size_type n (0); // Size of last non-space character (simpel mode). + + // Detect the multi-line mode introductor. + // + bool ml (false); + if (c == '\\') + { + get (); + xchar p (peek ()); + + if (p == '\n') + { + get (); // Newline is not part of the value so skip it. + c = peek (); + ml = true; + } + else if (is_eos (p)) + ml = true; + else + unget (c); + } + + // The nl flag signals that the preceding character was a "special + // newline", that is, a newline that was part of the milti-line mode + // introductor or an escape sequence. + // + for (bool nl (ml); !is_eos (c); c = peek ()) + { + // Detect the special "\n\\\n" sequence. In the multi-line mode, + // this is a "terminator". In the simple mode, this is a way to + // specify a newline. + // + // The key idea here is this: if we "swallowed" any characters + // (i.e., called get() without a matching unget()), then we + // have to restart the loop in order to do all the tests for + // the next character. Also, for this to work, we can only + // add one character to v, which limits us to maximum three + // characters look-ahead: one in v, one "ungot", and one + // peeked. + // + // The first block handles the special sequence that starts with + // a special newline. In multi-line mode, this is an "immediate + // termination" where we "use" the newline from the introductor. + // Note also that in the simple mode the special sequence can + // only start with a special (i.e., escaped) newline. + // + if (nl) + { + nl = false; + + if (c == '\\') + { + get (); + xchar c1 (peek ()); + + if (c1 == '\n' || is_eos (c1)) + { + if (ml) + break; + else + { + if (c1 == '\n') + get (); + + v += '\n'; // Literal newline. + n++; + continue; // Restart from the next character. + } + } + else + unget (c); // Fall through. + } + } + + if (c == '\n') + { + if (ml) + { + get (); + xchar c1 (peek ()); + + if (c1 == '\\') + { + get (); + xchar c2 (peek ()); + + if (c2 == '\n' || is_eos (c2)) + break; + else + { + v += '\n'; + unget (c1); + continue; // Restart from c1 (slash). + } + } + else + unget (c); // Fall through. + } + else + break; // Simple value terminator. + } + + // Detect the newline escape sequence. The same look-ahead + // approach as above. + // + if (c == '\\') + { + get (); + xchar c1 (peek ()); + + if (c1 == '\n' || is_eos (c1)) + { + if (c1 == '\n') + { + get (); + nl = true; // This is a special newline. + } + continue; // Restart from the next character. + } + else if (c1 == '\\') + { + get (); + xchar c2 (peek ()); + + if (c2 == '\n' || is_eos (c1)) + { + v += '\\'; + n++; + // Restart from c2 (newline/eos). + } + else + { + v += '\\'; + n++; + unget (c1); // Restart from c1 (second slash). + } + + continue; + } + else + unget (c); // Fall through. + } + + get (); + v += c; + + if (!ml) + { + if (c != ' ' && c != '\t') + n++; + } + } + + // Cut off trailing whitespaces. + // + if (!ml) + v.resize (n); + } + + manifest_parser::xchar manifest_parser:: + skip_spaces () + { + xchar c (peek ()); + bool start (c.column () == 1); + + for (; !is_eos (c); c = peek ()) + { + switch (c) + { + case ' ': + case '\t': + break; + case '\n': + { + // Skip empty lines. + // + if (!start) + return c; + + break; + } + case '#': + { + // We only recognize '#' as a start of a comment at the beginning + // of the line (sans leading spaces). + // + if (!start) + return c; + + get (); + + // Read until newline or eos. + // + for (c = peek (); !is_eos (c) && c != '\n'; c = peek ()) + get (); + + continue; + } + default: + return c; // Not a space. + } + + get (); + } + + return c; + } + + // Character interface. + // + + manifest_parser::xchar manifest_parser:: + peek () + { + if (unget_) + return buf_; + else + { + if (eos_) + return xchar (xchar::traits_type::eof (), l_, c_); + else + { + xchar::int_type v (is_.peek ()); + + if (v == xchar::traits_type::eof ()) + eos_ = true; + + return xchar (v, l_, c_); + } + } + } + + manifest_parser::xchar manifest_parser:: + get () + { + if (unget_) + { + unget_ = false; + return buf_; + } + else + { + // When is_.get () returns eof, the failbit is also set (stupid, + // isn't?) which may trigger an exception. To work around this + // we will call peek() first and only call get() if it is not + // eof. But we can only call peek() on eof once; any subsequent + // calls will spoil the failbit (even more stupid). + // + xchar c (peek ()); + + if (!is_eos (c)) + { + is_.get (); + + if (c == '\n') + { + l_++; + c_ = 1; + } + else + c_++; + } + + return c; + } + } + + void manifest_parser:: + unget (const xchar& c) + { + // Because iostream::unget cannot work once eos is reached, + // we have to provide our own implementation. + // + buf_ = c; + unget_ = true; + } + + // manifest_parsing + // + + static string + format (const string& n, uint64_t l, uint64_t c, const string& d) + { + ostringstream os; + if (!n.empty ()) + os << n << ':'; + os << l << ':' << c << ": error: " << d; + return os.str (); + } + + manifest_parsing:: + manifest_parsing (const string& n, uint64_t l, uint64_t c, const string& d) + : runtime_error (format (n, l, c, d)), + name (n), line (l), column (c), description (d) + { + } +} diff --git a/build/.gitignore b/build/.gitignore new file mode 100644 index 0000000..225c27f --- /dev/null +++ b/build/.gitignore @@ -0,0 +1 @@ +config.build diff --git a/build/bootstrap.build b/build/bootstrap.build new file mode 100644 index 0000000..0fff8ad --- /dev/null +++ b/build/bootstrap.build @@ -0,0 +1,6 @@ +# file : build/bootstrap.build +# copyright : Copyright (c) 2014-2015 Code Synthesis Ltd +# license : MIT; see accompanying LICENSE file + +project = libbpkg +using config diff --git a/build/root.build b/build/root.build new file mode 100644 index 0000000..dc1c297 --- /dev/null +++ b/build/root.build @@ -0,0 +1,8 @@ +# file : build/root.build +# copyright : Copyright (c) 2014-2015 Code Synthesis Ltd +# license : MIT; see accompanying LICENSE file + +using cxx + +cxx.std = 14 +cxx.poptions += -I$src_root diff --git a/buildfile b/buildfile new file mode 100644 index 0000000..b9c4507 --- /dev/null +++ b/buildfile @@ -0,0 +1,7 @@ +# file : buildfile +# copyright : Copyright (c) 2014-2015 Code Synthesis Ltd +# license : MIT; see accompanying LICENSE file + +d = bpkg/ tests/ +.: $d +include $d diff --git a/tests/.gitignore b/tests/.gitignore new file mode 100644 index 0000000..e54525b --- /dev/null +++ b/tests/.gitignore @@ -0,0 +1 @@ +driver diff --git a/tests/buildfile b/tests/buildfile new file mode 100644 index 0000000..8832040 --- /dev/null +++ b/tests/buildfile @@ -0,0 +1,7 @@ +# file : tests/buildfile +# copyright : Copyright (c) 2014-2015 Code Synthesis Ltd +# license : MIT; see accompanying LICENSE file + +d = manifest-parser/ +.: $d +include $d diff --git a/tests/manifest-parser/buildfile b/tests/manifest-parser/buildfile new file mode 100644 index 0000000..4fcf735 --- /dev/null +++ b/tests/manifest-parser/buildfile @@ -0,0 +1,7 @@ +# file : tests/manifest-parser/buildfile +# copyright : Copyright (c) 2014-2015 Code Synthesis Ltd +# license : MIT; see accompanying LICENSE file + +exe{driver}: cxx{driver} ../../bpkg/lib{bpkg} + +include ../../bpkg/ diff --git a/tests/manifest-parser/driver.cxx b/tests/manifest-parser/driver.cxx new file mode 100644 index 0000000..b86b806 --- /dev/null +++ b/tests/manifest-parser/driver.cxx @@ -0,0 +1,210 @@ +// file : tests/manifest-parser/driver.cxx -*- C++ -*- +// copyright : Copyright (c) 2014-2015 Code Synthesis Ltd +// license : MIT; see accompanying LICENSE file + +#include +#include +#include // pair +#include +#include +#include + +#include + +using namespace std; +using namespace bpkg; + +using pairs = vector>; + +static bool +parse (const char* manifest, const pairs& expected); + +static bool +fail (const char* manifest); + +int +main () +{ + // Whitespaces and comments. + // + assert (parse (" \t", {{"",""}})); + assert (parse (" \t\n \n\n", {{"",""}})); + assert (parse ("# one\n #two", {{"",""}})); + + // Test encountering eos at various points. + // + assert (parse ("", {{"",""}})); + assert (parse (" ", {{"",""}})); + assert (parse ("\n", {{"",""}})); + assert (fail ("a")); + assert (parse (":1\na:", {{"","1"},{"a", ""},{"",""},{"",""}})); + + // Invalid manifests. + // + assert (fail ("a:")); // format version pair expected + assert (fail (":")); // format version value expected + assert (fail (":9")); // unsupported format version + assert (fail ("a")); // ':' expected after name + assert (fail ("a b")); // ':' expected after name + assert (fail ("a\tb")); // ':' expected after name + assert (fail ("a\nb")); // ':' expected after name + assert (fail (":1\na:b\n:9")); // unsupported format version + + // Empty manifest. + // + assert (parse (":1", {{"","1"},{"",""},{"",""}})); + assert (parse (" \t :1", {{"","1"},{"",""},{"",""}})); + assert (parse (" \t : 1", {{"","1"},{"",""},{"",""}})); + assert (parse (" \t : 1 ", {{"","1"},{"",""},{"",""}})); + assert (parse (":1\n", {{"","1"},{"",""},{"",""}})); + assert (parse (":1 \n", {{"","1"},{"",""},{"",""}})); + + // Single manifest. + // + assert (parse (":1\na:x", {{"","1"},{"a", "x"},{"",""},{"",""}})); + assert (parse (":1\na:x\n", {{"","1"},{"a","x"},{"",""},{"",""}})); + assert (parse (":1\na:x\nb:y", + {{"","1"},{"a","x"},{"b","y"},{"",""},{"",""}})); + assert (parse (":1\na:x\n\tb : y\n #comment", + {{"","1"},{"a","x"},{"b","y"},{"",""},{"",""}})); + + // Multiple manifests. + // + assert (parse (":1\na:x\n:\nb:y", + {{"","1"},{"a", "x"},{"",""}, + {"","1"},{"b", "y"},{"",""},{"",""}})); + assert (parse (":1\na:x\n:1\nb:y", + {{"","1"},{"a", "x"},{"",""}, + {"","1"},{"b", "y"},{"",""},{"",""}})); + assert (parse (":1\na:x\n:\nb:y\n:\nc:z\n", + {{"","1"},{"a", "x"},{"",""}, + {"","1"},{"b", "y"},{"",""}, + {"","1"},{"c", "z"},{"",""},{"",""}})); + + // Name parsing. + // + assert (parse (":1\nabc:", {{"","1"},{"abc",""},{"",""},{"",""}})); + assert (parse (":1\nabc :", {{"","1"},{"abc",""},{"",""},{"",""}})); + assert (parse (":1\nabc\t:", {{"","1"},{"abc",""},{"",""},{"",""}})); + + // Simple value parsing. + // + assert (parse (":1\na: \t xyz \t ", {{"","1"},{"a","xyz"},{"",""},{"",""}})); + + // Simple value escaping. + // + assert (parse (":1\na:x\\", {{"","1"},{"a","x"},{"",""},{"",""}})); + assert (parse (":1\na:x\\\ny", {{"","1"},{"a","xy"},{"",""},{"",""}})); + assert (parse (":1\na:x\\\\\nb:", + {{"","1"},{"a","x\\"},{"b",""},{"",""},{"",""}})); + assert (parse (":1\na:x\\\\\\\nb:", + {{"","1"},{"a","x\\\\"},{"b",""},{"",""},{"",""}})); + + // Simple value literal newline. + // + assert (parse (":1\na:x\\\n\\", + {{"","1"},{"a","x\n"},{"",""},{"",""}})); + assert (parse (":1\na:x\\\n\\\ny", + {{"","1"},{"a","x\ny"},{"",""},{"",""}})); + assert (parse (":1\na:x\\\n\\\ny\\\n\\\nz", + {{"","1"},{"a","x\ny\nz"},{"",""},{"",""}})); + + // Multi-line value parsing. + // + assert (parse (":1\na:\\", {{"","1"},{"a", ""},{"",""},{"",""}})); + assert (parse (":1\na:\\\n", {{"","1"},{"a", ""},{"",""},{"",""}})); + assert (parse (":1\na:\\x", {{"","1"},{"a", "\\x"},{"",""},{"",""}})); + assert (parse (":1\na:\\\n\\", {{"","1"},{"a", ""},{"",""},{"",""}})); + assert (parse (":1\na:\\\n\\\n", {{"","1"},{"a", ""},{"",""},{"",""}})); + assert (parse (":1\na:\\\n\\x\n\\", + {{"","1"},{"a", "\\x"},{"",""},{"",""}})); + assert (parse (":1\na:\\\nx\ny", {{"","1"},{"a", "x\ny"},{"",""},{"",""}})); + assert (parse (":1\na:\\\n \n#\t\n\\", + {{"","1"},{"a", " \n#\t"},{"",""},{"",""}})); + assert (parse (":1\na:\\\n\n\n\\", {{"","1"},{"a", "\n"},{"",""},{"",""}})); + + // Multi-line value escaping. + // + assert (parse (":1\na:\\\nx\\", + {{"","1"},{"a","x"},{"",""},{"",""}})); + assert (parse (":1\na:\\\nx\\\ny\n\\", + {{"","1"},{"a","xy"},{"",""},{"",""}})); + assert (parse (":1\na:\\\nx\\\\\n\\\nb:", + {{"","1"},{"a","x\\"},{"b",""},{"",""},{"",""}})); + assert (parse (":1\na:\\\nx\\\\\\\n\\\nb:", + {{"","1"},{"a","x\\\\"},{"b",""},{"",""},{"",""}})); +} + +static std::ostream& +operator<< (std::ostream& os, const pairs& ps) +{ + os << '{'; + + bool f (true); + for (const auto& p: ps) + os << (f ? (f = false, "") : ",") + << '{' << p.first << ',' << p.second << '}'; + + os << '}'; + return os; +} + +static pairs +parse (const char* m) +{ + istringstream is (m); + is.exceptions (istream::failbit | istream::badbit); + manifest_parser p (is, ""); + + pairs r; + + for (bool eom (true), eos (false); !eos; ) + { + auto nv (p.next ()); + + if (nv.name.empty () && nv.value.empty ()) // End pair. + { + eos = eom; + eom = true; + } + else + eom = false; + + r.emplace_back (nv.name, nv.value); // move + } + + return r; +} + +static bool +parse (const char* m, const pairs& e) +{ + pairs r (parse (m)); + + if (r != e) + { + cerr << "actual: " << r << endl + << "expect: " << e << endl; + + return false; + } + + return true; +} + +static bool +fail (const char* m) +{ + try + { + pairs r (parse (m)); + cerr << "nofail: " << r << endl; + return false; + } + catch (const manifest_parsing& e) + { + cerr << e.what () << endl; + } + + return true; +} -- cgit v1.1