aboutsummaryrefslogtreecommitdiff
path: root/bpkg/manifest-parser.cxx
diff options
context:
space:
mode:
author Boris Kolpackov <boris@codesynthesis.com> 2015-06-09 19:50:58 +0200
committer Boris Kolpackov <boris@codesynthesis.com> 2015-06-09 19:50:58 +0200
commit 72648921ec28903615698a61aeff4799e1ca9a7d (patch)
tree 5c94e50453232a05561ed7967c0c4c19f86c83af /bpkg/manifest-parser.cxx
parent 718263310d93081d615e35301f3a55cd91c3b2ea (diff)
Implement low-level manifest parser
Diffstat (limited to 'bpkg/manifest-parser.cxx')
-rw-r--r-- bpkg/manifest-parser.cxx 453
1 files changed, 453 insertions, 0 deletions
diff --git a/bpkg/manifest-parser.cxx b/bpkg/manifest-parser.cxx
new file mode 100644
index 0000000..374de3e
--- /dev/null
+++ b/bpkg/manifest-parser.cxx
@@ -0,0 +1,453 @@
+// file : bpkg/manifest-parser.cxx -*- C++ -*-
+// copyright : Copyright (c) 2014-2015 Code Synthesis Ltd
+// license : MIT; see accompanying LICENSE file
+
+#include <bpkg/manifest-parser>
+
+#include <cassert>
+#include <sstream>
+
+using namespace std;
+
+namespace bpkg
+{
+ // Shorthand for the parser's name/value pair type, used by the function
+ // definitions below.
+ //
+ using name_value = manifest_parser::name_value_type;
+
+ // Return the next name/value pair from the stream.
+ //
+ // A pair with an empty name and a non-empty value is the "start" pair of
+ // a manifest (the value is the format version). A pair with both the name
+ // and the value empty is the "end" pair. Once the end of the stream has
+ // been reached, every subsequent call returns another "end" pair.
+ //
+ // Throws manifest_parsing if the input is malformed.
+ //
+ name_value manifest_parser::
+ next ()
+ {
+   // Past the end of the stream: keep handing out "end" pairs located at
+   // the current position.
+   //
+   if (s_ == eos)
+     return name_value {"", "", l_, c_, l_, c_};
+
+   xchar first (skip_spaces ());
+
+   // While inside a manifest, the special empty name (a bare ':') begins
+   // the next manifest. Before its "start" pair can be returned we owe the
+   // caller the "end" pair of the current one. Rather than caching the
+   // "start" pair, we stop right here, before consuming anything: the
+   // following call to next() resumes from the same ':' in the start state
+   // and produces the "start" pair naturally.
+   //
+   if (s_ == body && first == ':')
+   {
+     const uint64_t ln (first.line ());
+     const uint64_t cn (first.column ());
+
+     s_ = start;
+     return name_value {"", "", ln, cn, ln, cn};
+   }
+
+   // In either state a name comes next (possibly the special empty one).
+   //
+   name_value nv;
+   parse_name (nv);
+
+   skip_spaces ();
+   xchar sep (get ());
+
+   // Running out of input right after an empty name is fine: this is the
+   // final "end" pair.
+   //
+   if (is_eos (sep) && nv.name.empty ())
+   {
+     s_ = eos;
+
+     nv.value_line = nv.name_line;
+     nv.value_column = nv.name_column;
+     return nv;
+   }
+
+   // Anything other than ':' (including end of stream after a non-empty
+   // name) is an error.
+   //
+   if (is_eos (sep) || sep != ':')
+     throw manifest_parsing (name_, sep.line (), sep.column (),
+                             "':' expected after name");
+
+   skip_spaces ();
+   parse_value (nv);
+
+   // The value must be followed by a newline or the end of the stream.
+   //
+   xchar term (peek ());
+   assert (term == '\n' || is_eos (term));
+
+   if (term == '\n')
+     get ();
+
+   // Validate the pair against the state we are in.
+   //
+   if (s_ != start)
+   {
+     // Body of the manifest. The empty name was handled by the special
+     // case above so it cannot show up here.
+     //
+     assert (!nv.name.empty ());
+     return nv;
+   }
+
+   // Start of the (next) manifest: the first pair must be the special
+   // empty name with the format version as its value.
+   //
+   if (!nv.name.empty ())
+     throw manifest_parsing (name_, nv.name_line, nv.name_column,
+                             "format version pair expected");
+
+   if (!nv.value.empty ())
+   {
+     version_ = nv.value; // Remember the latest version seen.
+
+     if (version_ != "1")
+       throw manifest_parsing (name_, nv.value_line, nv.value_column,
+                               "unsupported format version " + version_);
+   }
+   else
+   {
+     // The version may only be omitted if an earlier manifest in the
+     // sequence has already specified it.
+     //
+     if (version_.empty ())
+       throw manifest_parsing (name_, nv.value_line, nv.value_column,
+                               "format version value expected");
+
+     nv.value = version_;
+   }
+
+   s_ = body;
+   return nv;
+ }
+
+ // Parse a (possibly empty) name, appending its characters to r.name and
+ // recording the position of its first character in r.name_line and
+ // r.name_column. The name ends at ':', space, tab, newline, or the end of
+ // the stream; the terminating character is left unread.
+ //
+ void manifest_parser::
+ parse_name (name_value& r)
+ {
+   xchar ch (peek ());
+
+   r.name_column = ch.column ();
+   r.name_line = ch.line ();
+
+   while (!is_eos (ch) &&
+          ch != ':' && ch != ' ' && ch != '\t' && ch != '\n')
+   {
+     r.name += ch;
+     get ();
+     ch = peek ();
+   }
+ }
+
+ // Parse the value of a name/value pair, appending it to r.value and
+ // recording the position of its first character in r.value_line and
+ // r.value_column.
+ //
+ // Two modes are recognized:
+ //
+ // - Simple mode: the value ends at the first unescaped newline (or at
+ //   the end of the stream). Trailing spaces and tabs are cut off.
+ //
+ // - Multi-line mode: entered when the value starts with a backslash that
+ //   is immediately followed by a newline (or the end of the stream). The
+ //   value ends at a newline followed by a backslash followed by a newline
+ //   (or the end of the stream). Nothing is trimmed.
+ //
+ // In both modes a backslash directly before a newline swallows that
+ // newline, and a double backslash directly before a newline produces a
+ // single backslash. In simple mode an escaped newline that is directly
+ // followed by another backslash-newline produces a literal newline.
+ //
+ // On return the newline (or end of stream) that terminates the value has
+ // not been consumed.
+ //
+ void manifest_parser::
+ parse_value (name_value& r)
+ {
+   xchar c (peek ());
+
+   r.value_line = c.line ();
+   r.value_column = c.column ();
+
+   string& v (r.value);
+
+   // Length of v up to and including the last character that must be kept
+   // (simple mode only). It has to be taken from v.size () rather than
+   // counted, otherwise spaces inside the value would throw the count off
+   // and the final resize() would cut into the value itself.
+   //
+   string::size_type n (0);
+
+   // Detect the multi-line mode introducer.
+   //
+   bool ml (false);
+   if (c == '\\')
+   {
+     get ();
+     xchar p (peek ());
+
+     if (p == '\n')
+     {
+       get (); // Newline is not part of the value so skip it.
+       c = peek ();
+       ml = true;
+     }
+     else if (is_eos (p))
+       ml = true;
+     else
+       unget (c);
+   }
+
+   // The nl flag signals that the preceding character was a "special
+   // newline", that is, a newline that was part of the multi-line mode
+   // introducer or an escape sequence.
+   //
+   for (bool nl (ml); !is_eos (c); c = peek ())
+   {
+     // Detect the special "\n\\\n" sequence. In the multi-line mode,
+     // this is a "terminator". In the simple mode, this is a way to
+     // specify a newline.
+     //
+     // The key idea here is this: if we "swallowed" any characters
+     // (i.e., called get() without a matching unget()), then we
+     // have to restart the loop in order to do all the tests for
+     // the next character. Also, for this to work, we can only
+     // add one character to v, which limits us to maximum three
+     // characters look-ahead: one in v, one "ungot", and one
+     // peeked.
+     //
+     // The first block handles the special sequence that starts with
+     // a special newline. In multi-line mode, this is an "immediate
+     // termination" where we "use" the newline from the introducer.
+     // Note also that in the simple mode the special sequence can
+     // only start with a special (i.e., escaped) newline.
+     //
+     if (nl)
+     {
+       nl = false;
+
+       if (c == '\\')
+       {
+         get ();
+         xchar c1 (peek ());
+
+         if (c1 == '\n' || is_eos (c1))
+         {
+           if (ml)
+             break;
+           else
+           {
+             if (c1 == '\n')
+               get ();
+
+             v += '\n';     // Literal newline.
+             n = v.size (); // Keep everything up to and including it.
+             continue;      // Restart from the next character.
+           }
+         }
+         else
+           unget (c); // Fall through.
+       }
+     }
+
+     if (c == '\n')
+     {
+       if (ml)
+       {
+         get ();
+         xchar c1 (peek ());
+
+         if (c1 == '\\')
+         {
+           get ();
+           xchar c2 (peek ());
+
+           if (c2 == '\n' || is_eos (c2))
+             break; // Multi-line terminator.
+           else
+           {
+             v += '\n';
+             unget (c1);
+             continue; // Restart from c1 (slash).
+           }
+         }
+         else
+           unget (c); // Fall through.
+       }
+       else
+         break; // Simple value terminator.
+     }
+
+     // Detect the newline escape sequence. The same look-ahead
+     // approach as above.
+     //
+     if (c == '\\')
+     {
+       get ();
+       xchar c1 (peek ());
+
+       if (c1 == '\n' || is_eos (c1))
+       {
+         if (c1 == '\n')
+         {
+           get ();
+           nl = true; // This is a special newline.
+         }
+         continue; // Restart from the next character.
+       }
+       else if (c1 == '\\')
+       {
+         get ();
+         xchar c2 (peek ());
+
+         // Either way the first slash ends up in the value.
+         //
+         v += '\\';
+         n = v.size ();
+
+         // If the pair of slashes is followed by a newline or the end of
+         // the stream, then the second slash was merely escaping the first
+         // and we restart from c2. Otherwise the second slash is an
+         // ordinary character and we restart from it.
+         //
+         if (c2 != '\n' && !is_eos (c2))
+           unget (c1);
+
+         continue;
+       }
+       else
+         unget (c); // Fall through.
+     }
+
+     get ();
+     v += c;
+
+     if (!ml && c != ' ' && c != '\t')
+       n = v.size ();
+   }
+
+   // Cut off trailing whitespace.
+   //
+   if (!ml)
+     v.resize (n);
+ }
+
+ // Skip spaces and tabs and, if we began at the first column of a line,
+ // also empty lines and '#' comment lines. Return the first character that
+ // was not skipped (without consuming it), or the end-of-stream character.
+ //
+ manifest_parser::xchar manifest_parser::
+ skip_spaces ()
+ {
+   xchar ch (peek ());
+
+   // Whether skipping started at the beginning of a line. Only in this
+   // case are newlines and comments treated as skippable.
+   //
+   const bool bol (ch.column () == 1);
+
+   while (!is_eos (ch))
+   {
+     if (ch == '#')
+     {
+       // '#' starts a comment only at the beginning of the line (leading
+       // spaces aside); otherwise it is an ordinary character.
+       //
+       if (!bol)
+         break;
+
+       get ();
+
+       // Discard everything up to (but not including) the newline or the
+       // end of the stream.
+       //
+       for (ch = peek (); !is_eos (ch) && ch != '\n'; ch = peek ())
+         get ();
+     }
+     else if (ch == '\n')
+     {
+       // A newline is only skipped as part of an empty line.
+       //
+       if (!bol)
+         break;
+
+       get ();
+     }
+     else if (ch == ' ' || ch == '\t')
+       get ();
+     else
+       break; // Not a space.
+
+     ch = peek ();
+   }
+
+   return ch;
+ }
+
+ // Character interface.
+ //
+
+ // Return the next character, tagged with the current line and column,
+ // without consuming it. A character that was put back with unget() takes
+ // precedence over the stream.
+ //
+ manifest_parser::xchar manifest_parser::
+ peek ()
+ {
+   if (unget_)
+     return buf_;
+
+   // Once the end of the stream has been seen, do not touch the stream
+   // again; just keep reporting eof.
+   //
+   if (eos_)
+     return xchar (xchar::traits_type::eof (), l_, c_);
+
+   xchar::int_type next (is_.peek ());
+
+   if (next == xchar::traits_type::eof ())
+     eos_ = true;
+
+   return xchar (next, l_, c_);
+ }
+
+ // Consume and return the next character, updating the line and column
+ // counters. A character that was put back with unget() is returned first.
+ // At the end of the stream the eof character is returned and nothing is
+ // consumed.
+ //
+ manifest_parser::xchar manifest_parser::
+ get ()
+ {
+   if (unget_)
+   {
+     unget_ = false;
+     return buf_;
+   }
+
+   // Calling is_.get () at eof also sets the failbit, which may trigger an
+   // exception. To avoid that we go through peek() first and only call
+   // is_.get () when there is an actual character to read. (peek() itself
+   // takes care not to hit the stream more than once at eof.)
+   //
+   xchar ch (peek ());
+
+   if (is_eos (ch))
+     return ch;
+
+   is_.get ();
+
+   if (ch == '\n')
+   {
+     // Moving on to the first column of the next line.
+     //
+     ++l_;
+     c_ = 1;
+   }
+   else
+     ++c_;
+
+   return ch;
+ }
+
+ // Put a single character back so that the next peek() or get() returns
+ // it. Only one character is buffered; a second call overwrites the first.
+ //
+ void manifest_parser::
+ unget (const xchar& c)
+ {
+   // We keep our own one-character buffer instead of relying on
+   // iostream::unget, which cannot work once eos has been reached.
+   //
+   unget_ = true;
+   buf_ = c;
+ }
+
+ // manifest_parsing
+ //
+
+ static string
+ format (const string& n, uint64_t l, uint64_t c, const string& d)
+ {
+ ostringstream os;
+ if (!n.empty ())
+ os << n << ':';
+ os << l << ':' << c << ": error: " << d;
+ return os.str ();
+ }
+
+ // Construct a parsing error for the stream named n at line l, column c,
+ // with description d. The runtime_error base is initialized with the
+ // message produced by format() for these arguments, and the arguments
+ // themselves are stored in the name, line, column, and description
+ // members.
+ //
+ manifest_parsing::
+ manifest_parsing (const string& n, uint64_t l, uint64_t c, const string& d)
+ : runtime_error (format (n, l, c, d)),
+ name (n), line (l), column (c), description (d)
+ {
+ }
+}