From 72648921ec28903615698a61aeff4799e1ca9a7d Mon Sep 17 00:00:00 2001 From: Boris Kolpackov Date: Tue, 9 Jun 2015 19:50:58 +0200 Subject: Implement low-level manifest parser --- bpkg/buildfile | 5 + bpkg/manifest-parser | 141 +++++++++++++++ bpkg/manifest-parser.cxx | 453 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 599 insertions(+) create mode 100644 bpkg/buildfile create mode 100644 bpkg/manifest-parser create mode 100644 bpkg/manifest-parser.cxx (limited to 'bpkg') diff --git a/bpkg/buildfile b/bpkg/buildfile new file mode 100644 index 0000000..b7c1c9f --- /dev/null +++ b/bpkg/buildfile @@ -0,0 +1,5 @@ +# file : bpkg/buildfile +# copyright : Copyright (c) 2014-2015 Code Synthesis Ltd +# license : MIT; see accompanying LICENSE file + +lib{bpkg}: cxx{manifest-parser} diff --git a/bpkg/manifest-parser b/bpkg/manifest-parser new file mode 100644 index 0000000..2b1e4a5 --- /dev/null +++ b/bpkg/manifest-parser @@ -0,0 +1,141 @@ +// file : bpkg/manifest-parser -*- C++ -*- +// copyright : Copyright (c) 2014-2015 Code Synthesis Ltd +// license : MIT; see accompanying LICENSE file + +#ifndef BPKG_MANIFEST_PARSER +#define BPKG_MANIFEST_PARSER + +#include +#include +#include // uint64_t +#include // runtime_error + +namespace bpkg +{ + class manifest_parsing: public std::runtime_error + { + public: + manifest_parsing (const std::string& name, + std::uint64_t line, + std::uint64_t column, + const std::string& description); + + std::string name; + std::uint64_t line; + std::uint64_t column; + std::string description; + }; + + class manifest_parser + { + public: + manifest_parser (std::istream& is, const std::string& name) + : is_ (is), name_ (name) {} + + const std::string& + name () const {return name_;} + + struct name_value_type + { + std::string name; + std::string value; + + std::uint64_t name_line; + std::uint64_t name_column; + + std::uint64_t value_line; + std::uint64_t value_column; + }; + + // The first returned pair is special "start-of-manifest" with + // empty name and value being the format version: {"", ""}. + // After that we have a sequence of ordinary pairs which are + // the manifest. At the end of the manifest we have the special + // "end-of-manifest" pair with empty name and value: {"", ""}. + // After that we can either get another start-of-manifest pair + // (in which case the whole sequence repeats from the beginning) + // or we get another end-of-manifest pair which signals the end + // of stream (aka EOF). To put it another way, the parse sequence + // always has the following form: + // + // ({"", ""} {"", ""}* {"", ""})* {"", ""} + // + name_value_type + next (); + + private: + class xchar + { + public: + typedef std::char_traits traits_type; + typedef traits_type::int_type int_type; + typedef traits_type::char_type char_type; + + xchar (int_type v, std::uint64_t l, std::uint64_t c) + : v_ (v), l_ (l), c_ (c) {} + + operator char_type () const {return static_cast (v_);} + + int_type + value () const {return v_;} + + std::uint64_t line () const {return l_;} + std::uint64_t column () const {return c_;} + + private: + int_type v_; + std::uint64_t l_; + std::uint64_t c_; + }; + + private: + void + parse_name (name_value_type&); + + void + parse_value (name_value_type&); + + // Skip spaces and return the first peeked non-space character. + // + xchar + skip_spaces (); + + // Character interface. + // + private: + xchar + peek (); + + xchar + get (); + + void + unget (const xchar&); + + // Tests. + // + bool + is_eos (const xchar& c) const + { + return c.value () == xchar::traits_type::eof (); + } + + private: + enum {start, body, eos} s_ = start; + std::string version_; // Current format version. + + private: + std::istream& is_; + const std::string name_; + + std::uint64_t l_ {1}; + std::uint64_t c_ {1}; + + bool unget_ {false}; + xchar buf_ {0, 0, 0}; + + bool eos_ {false}; + }; +} + +#endif // BPKG_MANIFEST_PARSER diff --git a/bpkg/manifest-parser.cxx b/bpkg/manifest-parser.cxx new file mode 100644 index 0000000..374de3e --- /dev/null +++ b/bpkg/manifest-parser.cxx @@ -0,0 +1,453 @@ +// file : bpkg/manifest-parser.cxx -*- C++ -*- +// copyright : Copyright (c) 2014-2015 Code Synthesis Ltd +// license : MIT; see accompanying LICENSE file + +#include + +#include +#include + +using namespace std; + +namespace bpkg +{ + using name_value = manifest_parser::name_value_type; + + name_value manifest_parser:: + next () + { + if (s_ == eos) + return name_value {"", "", l_, c_, l_, c_}; + + xchar c (skip_spaces ()); + + // Here is the problem: if we are in the 'body' state (that is, + // we are parsing inside the manifest) and we see the special + // empty name, then before returning the "start" pair for the + // next manifest, we have to return the "end" pair. One way + // would be to cache the "start" pair and return it on the + // next call of next(). But that would require quite a bit + // of extra logic. The alternative is to detect the beginning + // of the empty name before parsing too far. This way, the + // next call to next() will start parsing where we left of + // and return the "start" pair naturally. + // + if (s_ == body && c == ':') + { + s_ = start; + uint64_t ln (c.line ()), cn (c.column ()); + return name_value {"", "", ln, cn, ln, cn}; + } + + // Regardless of the state, what should come next is a name, + // potentially the special empty one. + // + name_value r; + parse_name (r); + + skip_spaces (); + c = get (); + + if (is_eos (c)) + { + // This is ok as long as the name is empty. + // + if (!r.name.empty ()) + throw manifest_parsing (name_, c.line (), c.column (), + "':' expected after name"); + + s_ = eos; + + // The "end" pair. + // + r.value_line = r.name_line; + r.value_column = r.name_column; + return r; + } + + if (c != ':') + throw manifest_parsing (name_, c.line (), c.column (), + "':' expected after name"); + + skip_spaces (); + parse_value (r); + + c = peek (); + + // The character after the value should be either a newline or eos. + // + assert (c == '\n' || is_eos (c)); + + if (c == '\n') + get (); + + // Now figure out whether what we've got makes sense, depending + // on the state we are in. + // + if (s_ == start) + { + // Start of the (next) manifest. The first pair should be the + // special empty name/format version. + // + if (!r.name.empty ()) + throw manifest_parsing (name_, r.name_line, r.name_column, + "format version pair expected"); + + // The version value is only mandatory for the first manifest in + // a sequence. + // + if (r.value.empty ()) + { + if (version_.empty ()) + throw manifest_parsing (name_, r.value_line, r.value_column, + "format version value expected"); + r.value = version_; + } + else + { + version_ = r.value; // Update with the latest. + + if (version_ != "1") + throw manifest_parsing (name_, r.value_line, r.value_column, + "unsupported format version " + version_); + } + + s_ = body; + } + else + { + // Parsing the body of the manifest. + // + + // Should have been handled by the special case above. + // + assert (!r.name.empty ()); + } + + return r; + } + + void manifest_parser:: + parse_name (name_value& r) + { + xchar c (peek ()); + + r.name_line = c.line (); + r.name_column = c.column (); + + for (; !is_eos (c); c = peek ()) + { + if (c == ':' || c == ' ' || c == '\t' || c == '\n') + break; + + r.name += c; + get (); + } + } + + void manifest_parser:: + parse_value (name_value& r) + { + xchar c (peek ()); + + r.value_line = c.line (); + r.value_column = c.column (); + + string& v (r.value); + string::size_type n (0); // Size of last non-space character (simpel mode). + + // Detect the multi-line mode introductor. + // + bool ml (false); + if (c == '\\') + { + get (); + xchar p (peek ()); + + if (p == '\n') + { + get (); // Newline is not part of the value so skip it. + c = peek (); + ml = true; + } + else if (is_eos (p)) + ml = true; + else + unget (c); + } + + // The nl flag signals that the preceding character was a "special + // newline", that is, a newline that was part of the milti-line mode + // introductor or an escape sequence. + // + for (bool nl (ml); !is_eos (c); c = peek ()) + { + // Detect the special "\n\\\n" sequence. In the multi-line mode, + // this is a "terminator". In the simple mode, this is a way to + // specify a newline. + // + // The key idea here is this: if we "swallowed" any characters + // (i.e., called get() without a matching unget()), then we + // have to restart the loop in order to do all the tests for + // the next character. Also, for this to work, we can only + // add one character to v, which limits us to maximum three + // characters look-ahead: one in v, one "ungot", and one + // peeked. + // + // The first block handles the special sequence that starts with + // a special newline. In multi-line mode, this is an "immediate + // termination" where we "use" the newline from the introductor. + // Note also that in the simple mode the special sequence can + // only start with a special (i.e., escaped) newline. + // + if (nl) + { + nl = false; + + if (c == '\\') + { + get (); + xchar c1 (peek ()); + + if (c1 == '\n' || is_eos (c1)) + { + if (ml) + break; + else + { + if (c1 == '\n') + get (); + + v += '\n'; // Literal newline. + n++; + continue; // Restart from the next character. + } + } + else + unget (c); // Fall through. + } + } + + if (c == '\n') + { + if (ml) + { + get (); + xchar c1 (peek ()); + + if (c1 == '\\') + { + get (); + xchar c2 (peek ()); + + if (c2 == '\n' || is_eos (c2)) + break; + else + { + v += '\n'; + unget (c1); + continue; // Restart from c1 (slash). + } + } + else + unget (c); // Fall through. + } + else + break; // Simple value terminator. + } + + // Detect the newline escape sequence. The same look-ahead + // approach as above. + // + if (c == '\\') + { + get (); + xchar c1 (peek ()); + + if (c1 == '\n' || is_eos (c1)) + { + if (c1 == '\n') + { + get (); + nl = true; // This is a special newline. + } + continue; // Restart from the next character. + } + else if (c1 == '\\') + { + get (); + xchar c2 (peek ()); + + if (c2 == '\n' || is_eos (c1)) + { + v += '\\'; + n++; + // Restart from c2 (newline/eos). + } + else + { + v += '\\'; + n++; + unget (c1); // Restart from c1 (second slash). + } + + continue; + } + else + unget (c); // Fall through. + } + + get (); + v += c; + + if (!ml) + { + if (c != ' ' && c != '\t') + n++; + } + } + + // Cut off trailing whitespaces. + // + if (!ml) + v.resize (n); + } + + manifest_parser::xchar manifest_parser:: + skip_spaces () + { + xchar c (peek ()); + bool start (c.column () == 1); + + for (; !is_eos (c); c = peek ()) + { + switch (c) + { + case ' ': + case '\t': + break; + case '\n': + { + // Skip empty lines. + // + if (!start) + return c; + + break; + } + case '#': + { + // We only recognize '#' as a start of a comment at the beginning + // of the line (sans leading spaces). + // + if (!start) + return c; + + get (); + + // Read until newline or eos. + // + for (c = peek (); !is_eos (c) && c != '\n'; c = peek ()) + get (); + + continue; + } + default: + return c; // Not a space. + } + + get (); + } + + return c; + } + + // Character interface. + // + + manifest_parser::xchar manifest_parser:: + peek () + { + if (unget_) + return buf_; + else + { + if (eos_) + return xchar (xchar::traits_type::eof (), l_, c_); + else + { + xchar::int_type v (is_.peek ()); + + if (v == xchar::traits_type::eof ()) + eos_ = true; + + return xchar (v, l_, c_); + } + } + } + + manifest_parser::xchar manifest_parser:: + get () + { + if (unget_) + { + unget_ = false; + return buf_; + } + else + { + // When is_.get () returns eof, the failbit is also set (stupid, + // isn't?) which may trigger an exception. To work around this + // we will call peek() first and only call get() if it is not + // eof. But we can only call peek() on eof once; any subsequent + // calls will spoil the failbit (even more stupid). + // + xchar c (peek ()); + + if (!is_eos (c)) + { + is_.get (); + + if (c == '\n') + { + l_++; + c_ = 1; + } + else + c_++; + } + + return c; + } + } + + void manifest_parser:: + unget (const xchar& c) + { + // Because iostream::unget cannot work once eos is reached, + // we have to provide our own implementation. + // + buf_ = c; + unget_ = true; + } + + // manifest_parsing + // + + static string + format (const string& n, uint64_t l, uint64_t c, const string& d) + { + ostringstream os; + if (!n.empty ()) + os << n << ':'; + os << l << ':' << c << ": error: " << d; + return os.str (); + } + + manifest_parsing:: + manifest_parsing (const string& n, uint64_t l, uint64_t c, const string& d) + : runtime_error (format (n, l, c, d)), + name (n), line (l), column (c), description (d) + { + } +} -- cgit v1.1