aboutsummaryrefslogtreecommitdiff
path: root/butl
diff options
context:
space:
mode:
authorKaren Arutyunov <karen@codesynthesis.com>2016-09-29 21:54:14 +0300
committerKaren Arutyunov <karen@codesynthesis.com>2016-09-29 23:28:03 +0300
commit25a9484378ddaae9602ec54532cdc03b1f1924ef (patch)
tree7aafb613337eb6c6aee4fef78b8345405c4d7f70 /butl
parentf4f6d906733027a7bd802e035b3e9852db7be967 (diff)
Add manifest_parser and manifest_serializer
Diffstat (limited to 'butl')
-rw-r--r--butl/buildfile45
-rw-r--r--butl/manifest-forward15
-rw-r--r--butl/manifest-parser94
-rw-r--r--butl/manifest-parser.cxx379
-rw-r--r--butl/manifest-serializer75
-rw-r--r--butl/manifest-serializer.cxx238
6 files changed, 825 insertions, 21 deletions
diff --git a/butl/buildfile b/butl/buildfile
index 1a9787a..fcb5f86 100644
--- a/butl/buildfile
+++ b/butl/buildfile
@@ -2,27 +2,30 @@
# copyright : Copyright (c) 2014-2016 Code Synthesis Ltd
# license : MIT; see accompanying LICENSE file
-lib{butl}: \
-{hxx cxx}{ base64 } \
-{hxx cxx}{ char-scanner } \
-{hxx }{ export } \
-{hxx ixx cxx}{ fdstream } \
-{hxx ixx cxx}{ filesystem } \
-{hxx }{ multi-index } \
-{hxx }{ optional } \
-{hxx cxx}{ pager } \
-{hxx ixx txx cxx}{ path } \
-{hxx }{ path-io } \
-{hxx }{ path-map } \
-{hxx txx }{ prefix-map } \
-{hxx ixx cxx}{ process } \
-{hxx cxx}{ sha256 } \
-{hxx txx }{ string-table } \
-{hxx cxx}{ timestamp } \
-{hxx cxx}{ triplet } \
-{hxx ixx }{ utility } \
-{hxx }{ vector-view } \
-{hxx }{ version }
+lib{butl}: \
+{hxx cxx}{ base64 } \
+{hxx cxx}{ char-scanner } \
+{hxx }{ export } \
+{hxx ixx cxx}{ fdstream } \
+{hxx ixx cxx}{ filesystem } \
+{hxx }{ manifest-forward } \
+{hxx cxx}{ manifest-parser } \
+{hxx cxx}{ manifest-serializer } \
+{hxx }{ multi-index } \
+{hxx }{ optional } \
+{hxx cxx}{ pager } \
+{hxx ixx txx cxx}{ path } \
+{hxx }{ path-io } \
+{hxx }{ path-map } \
+{hxx txx }{ prefix-map } \
+{hxx ixx cxx}{ process } \
+{hxx cxx}{ sha256 } \
+{hxx txx }{ string-table } \
+{hxx cxx}{ timestamp } \
+{hxx cxx}{ triplet } \
+{hxx ixx }{ utility } \
+{hxx }{ vector-view } \
+{hxx }{ version }
# Exclude these from compilation on non-Windows targets.
#
diff --git a/butl/manifest-forward b/butl/manifest-forward
new file mode 100644
index 0000000..5dc5060
--- /dev/null
+++ b/butl/manifest-forward
@@ -0,0 +1,15 @@
+// file : butl/manifest-forward -*- C++ -*-
+// copyright : Copyright (c) 2014-2016 Code Synthesis Ltd
+// license : MIT; see accompanying LICENSE file
+
+#ifndef BUTL_MANIFEST_FORWARD
+#define BUTL_MANIFEST_FORWARD
+
+// Forward declarations of the manifest types defined in butl/manifest-parser
+// and butl/manifest-serializer.
+//
+namespace butl
+{
+ class manifest_parser;
+ class manifest_serializer;
+ class manifest_name_value;
+}
+
+#endif // BUTL_MANIFEST_FORWARD
diff --git a/butl/manifest-parser b/butl/manifest-parser
new file mode 100644
index 0000000..a005b34
--- /dev/null
+++ b/butl/manifest-parser
@@ -0,0 +1,94 @@
+// file : butl/manifest-parser -*- C++ -*-
+// copyright : Copyright (c) 2014-2016 Code Synthesis Ltd
+// license : MIT; see accompanying LICENSE file
+
+#ifndef BUTL_MANIFEST_PARSER
+#define BUTL_MANIFEST_PARSER
+
+#include <string>
+#include <iosfwd>
+#include <cstdint> // uint64_t
+#include <stdexcept> // runtime_error
+
+#include <butl/export>
+
+#include <butl/char-scanner>
+
+namespace butl
+{
+ // Exception describing a manifest parsing error. The runtime_error message
+ // has the form "<name>:<line>:<column>: error: <description>" (the "<name>:"
+ // prefix is omitted if the name is empty). The individual components are
+ // also available as the public data members.
+ //
+ class LIBBUTL_EXPORT manifest_parsing: public std::runtime_error
+ {
+ public:
+ manifest_parsing (const std::string& name,
+ std::uint64_t line,
+ std::uint64_t column,
+ const std::string& description);
+
+ std::string name; // Parser name (see manifest_parser::name ()).
+ std::uint64_t line; // Line the error refers to.
+ std::uint64_t column; // Column the error refers to.
+ std::string description; // Error description without the location prefix.
+ };
+
+ // A single name/value pair as returned by manifest_parser::next () together
+ // with the input positions at which the name and the value start.
+ //
+ class manifest_name_value
+ {
+ public:
+ std::string name;
+ std::string value;
+
+ // Position of the first character of the name.
+ //
+ std::uint64_t name_line;
+ std::uint64_t name_column;
+
+ // Position of the first character of the value. For the special
+ // end-of-manifest pairs the parser sets this to the name position.
+ //
+ std::uint64_t value_line;
+ std::uint64_t value_column;
+
+ // Return true if both the name and the value are empty, which is what the
+ // special end-of-manifest pair looks like.
+ //
+ bool
+ empty () const {return name.empty () && value.empty ();}
+ };
+
+ // Manifest parser that reads its input through the char_scanner base and
+ // returns one name/value pair per call to next ().
+ //
+ class LIBBUTL_EXPORT manifest_parser: protected butl::char_scanner
+ {
+ public:
+ // Create a parser reading from the specified stream. The name is stored
+ // and passed to the manifest_parsing exceptions thrown by this parser.
+ //
+ manifest_parser (std::istream& is, const std::string& name)
+ : char_scanner (is), name_ (name) {}
+
+ const std::string&
+ name () const {return name_;}
+
+ // The first returned pair is special "start-of-manifest" with
+ // empty name and value being the format version: {"", "<ver>"}.
+ // After that we have a sequence of ordinary pairs which are
+ // the manifest. At the end of the manifest we have the special
+ // "end-of-manifest" pair with empty name and value: {"", ""}.
+ // After that we can either get another start-of-manifest pair
+ // (in which case the whole sequence repeats from the beginning)
+ // or we get another end-of-manifest pair which signals the end
+ // of stream (aka EOF). To put it another way, the parse sequence
+ // always has the following form:
+ //
+ // ({"", "<ver>"} {"<name>", "<value>"}* {"", ""})* {"", ""}
+ //
+ // Throw manifest_parsing if the input is invalid.
+ //
+ manifest_name_value
+ next ();
+
+ private:
+ // Read the (possibly empty) name into the passed pair and record the
+ // position at which it starts. The name ends at ':', space, tab, newline,
+ // or end of stream, none of which is consumed.
+ //
+ void
+ parse_name (manifest_name_value&);
+
+ // Read the value (in either the simple or the multi-line mode) into the
+ // passed pair and record the position at which it starts.
+ //
+ void
+ parse_value (manifest_name_value&);
+
+ // Skip spaces and return the first peeked non-space character.
+ // If called at the beginning of a line, also skip empty lines and
+ // '#'-comment lines.
+ //
+ xchar
+ skip_spaces ();
+
+ private:
+ const std::string name_;
+
+ // Parsing state: start -- the format version pair is expected next;
+ // body -- inside a manifest; end -- end of stream has been reached.
+ //
+ enum {start, body, end} s_ = start;
+ std::string version_; // Current format version.
+ };
+}
+
+#endif // BUTL_MANIFEST_PARSER
diff --git a/butl/manifest-parser.cxx b/butl/manifest-parser.cxx
new file mode 100644
index 0000000..ec26ca8
--- /dev/null
+++ b/butl/manifest-parser.cxx
@@ -0,0 +1,379 @@
+// file : butl/manifest-parser.cxx -*- C++ -*-
+// copyright : Copyright (c) 2014-2016 Code Synthesis Ltd
+// license : MIT; see accompanying LICENSE file
+
+#include <butl/manifest-parser>
+
+#include <cassert>
+#include <sstream>
+
+using namespace std;
+
+namespace butl
+{
+ using parsing = manifest_parsing;
+ using name_value = manifest_name_value;
+
+ // Return the next pair of the parse sequence (start-of-manifest pair,
+ // ordinary pairs, end-of-manifest pair; see the header for details).
+ //
+ // Throw manifest_parsing if a name is not followed by ':', if a manifest
+ // does not start with the format version pair, if the format version is
+ // missing for the first manifest, or if the version is not "1".
+ //
+ name_value manifest_parser::
+ next ()
+ {
+ // Once the end of stream has been seen, keep returning the end pair.
+ //
+ if (s_ == end)
+ return name_value {"", "", line, column, line, column};
+
+ xchar c (skip_spaces ());
+
+ // Here is the problem: if we are in the 'body' state (that is,
+ // we are parsing inside the manifest) and we see the special
+ // empty name, then before returning the "start" pair for the
+ // next manifest, we have to return the "end" pair. One way
+ // would be to cache the "start" pair and return it on the
+ // next call of next(). But that would require quite a bit
+ // of extra logic. The alternative is to detect the beginning
+ // of the empty name before parsing too far. This way, the
+ // next call to next() will start parsing where we left off
+ // and return the "start" pair naturally.
+ //
+ if (s_ == body && c == ':')
+ {
+ s_ = start;
+ return name_value {"", "", c.line, c.column, c.line, c.column};
+ }
+
+ // Regardless of the state, what should come next is a name,
+ // potentially the special empty one.
+ //
+ name_value r;
+ parse_name (r);
+
+ skip_spaces ();
+ c = get ();
+
+ if (eos (c))
+ {
+ // This is ok as long as the name is empty.
+ //
+ if (!r.name.empty ())
+ throw parsing (name_, c.line, c.column, "':' expected after name");
+
+ s_ = end;
+
+ // The "end" pair.
+ //
+ r.value_line = r.name_line;
+ r.value_column = r.name_column;
+ return r;
+ }
+
+ if (c != ':')
+ throw parsing (name_, c.line, c.column, "':' expected after name");
+
+ skip_spaces ();
+ parse_value (r);
+
+ c = peek ();
+
+ // The character after the value should be either a newline or eos.
+ //
+ assert (c == '\n' || eos (c));
+
+ if (c == '\n')
+ get ();
+
+ // Now figure out whether what we've got makes sense, depending
+ // on the state we are in.
+ //
+ if (s_ == start)
+ {
+ // Start of the (next) manifest. The first pair should be the
+ // special empty name/format version.
+ //
+ if (!r.name.empty ())
+ throw parsing (name_, r.name_line, r.name_column,
+ "format version pair expected");
+
+ // The version value is only mandatory for the first manifest in
+ // a sequence.
+ //
+ if (r.value.empty ())
+ {
+ if (version_.empty ())
+ throw parsing (name_, r.value_line, r.value_column,
+ "format version value expected");
+ r.value = version_;
+ }
+ else
+ {
+ version_ = r.value; // Update with the latest.
+
+ if (version_ != "1")
+ throw parsing (name_, r.value_line, r.value_column,
+ "unsupported format version " + version_);
+ }
+
+ s_ = body;
+ }
+ else
+ {
+ // Parsing the body of the manifest.
+ //
+
+ // Should have been handled by the special case above.
+ //
+ assert (!r.name.empty ());
+ }
+
+ return r;
+ }
+
+ // Read the name into r.name and record where it starts. The name extends
+ // up to (but not including) the first ':', space, tab, newline, or end of
+ // stream and so may be empty.
+ //
+ void manifest_parser::
+ parse_name (name_value& r)
+ {
+   xchar ch (peek ());
+
+   // The name starts at whatever we have peeked first.
+   //
+   r.name_line = ch.line;
+   r.name_column = ch.column;
+
+   while (!eos (ch))
+   {
+     const bool delimiter (ch == ':' || ch == ' ' || ch == '\t' || ch == '\n');
+
+     if (delimiter)
+       break;
+
+     r.name += ch;
+     get ();
+     ch = peek ();
+   }
+ }
+
+ // Read the value into r.value and record the position at which it starts.
+ //
+ // Two modes are supported. In the simple mode the value extends to the
+ // first unescaped newline (or end of stream) and trailing spaces and tabs
+ // are cut off. The multi-line mode is introduced by a backslash followed
+ // by a newline (or end of stream) and is terminated by a line that
+ // contains just a backslash; nothing is trimmed in this mode.
+ //
+ // On return the next character to be read is either a newline or end of
+ // stream.
+ //
+ void manifest_parser::
+ parse_value (name_value& r)
+ {
+   xchar c (peek ());
+
+   r.value_line = c.line;
+   r.value_column = c.column;
+
+   string& v (r.value);
+   string::size_type n (0); // Size of v up to the last non-space (simple mode).
+
+   // Detect the multi-line mode introductor.
+   //
+   bool ml (false);
+   if (c == '\\')
+   {
+     get ();
+     xchar p (peek ());
+
+     if (p == '\n')
+     {
+       get (); // Newline is not part of the value so skip it.
+       c = peek ();
+       ml = true;
+     }
+     else if (eos (p))
+       ml = true;
+     else
+       unget (c);
+   }
+
+   // The nl flag signals that the preceding character was a "special
+   // newline", that is, a newline that was part of the multi-line mode
+   // introductor or an escape sequence.
+   //
+   for (bool nl (ml); !eos (c); c = peek ())
+   {
+     // Detect the special "\n\\\n" sequence. In the multi-line mode,
+     // this is a "terminator". In the simple mode, this is a way to
+     // specify a newline.
+     //
+     // The key idea here is this: if we "swallowed" any characters
+     // (i.e., called get() without a matching unget()), then we
+     // have to restart the loop in order to do all the tests for
+     // the next character. Also, for this to work, we can only
+     // add one character to v, which limits us to maximum three
+     // characters look-ahead: one in v, one "ungot", and one
+     // peeked.
+     //
+     // The first block handles the special sequence that starts with
+     // a special newline. In multi-line mode, this is an "immediate
+     // termination" where we "use" the newline from the introductor.
+     // Note also that in the simple mode the special sequence can
+     // only start with a special (i.e., escaped) newline.
+     //
+     if (nl)
+     {
+       nl = false;
+
+       if (c == '\\')
+       {
+         get ();
+         xchar c1 (peek ());
+
+         if (c1 == '\n' || eos (c1))
+         {
+           if (ml)
+             break;
+           else
+           {
+             if (c1 == '\n')
+               get ();
+
+             v += '\n'; // Literal newline.
+             n = v.size ();
+             continue; // Restart from the next character.
+           }
+         }
+         else
+           unget (c); // Fall through.
+       }
+     }
+
+     if (c == '\n')
+     {
+       if (ml)
+       {
+         get ();
+         xchar c1 (peek ());
+
+         if (c1 == '\\')
+         {
+           get ();
+           xchar c2 (peek ());
+
+           if (c2 == '\n' || eos (c2))
+             break;
+           else
+           {
+             v += '\n';
+             unget (c1);
+             continue; // Restart from c1 (slash).
+           }
+         }
+         else
+           unget (c); // Fall through.
+       }
+       else
+         break; // Simple value terminator.
+     }
+
+     // Detect the newline escape sequence. The same look-ahead
+     // approach as above.
+     //
+     if (c == '\\')
+     {
+       get ();
+       xchar c1 (peek ());
+
+       if (c1 == '\n' || eos (c1))
+       {
+         if (c1 == '\n')
+         {
+           get ();
+           nl = true; // This is a special newline.
+         }
+         continue; // Restart from the next character.
+       }
+       else if (c1 == '\\')
+       {
+         get ();
+         xchar c2 (peek ());
+
+         // Note: it is c2 (the character after the second slash) that has
+         // to be tested for eos here; c1 is known to be a backslash.
+         //
+         if (c2 == '\n' || eos (c2))
+         {
+           v += '\\';
+           n = v.size ();
+           // Restart from c2 (newline/eos).
+         }
+         else
+         {
+           v += '\\';
+           n = v.size ();
+           unget (c1); // Restart from c1 (second slash).
+         }
+
+         continue;
+       }
+       else
+         unget (c); // Fall through.
+     }
+
+     get ();
+     v += c;
+
+     if (!ml && c != ' ' && c != '\t')
+       n = v.size ();
+   }
+
+   // Cut off trailing whitespaces.
+   //
+   if (!ml)
+     v.resize (n);
+ }
+
+ // Skip spaces and tabs and return the first peeked character that is not
+ // skipped (or eos). If the first peeked character is in column 1, then
+ // newlines (empty lines) and '#'-comments are skipped as well; otherwise
+ // both are returned as is.
+ //
+ manifest_parser::xchar manifest_parser::
+ skip_spaces ()
+ {
+   xchar c (peek ());
+
+   // Whether we have started at the beginning of a line. Note that this is
+   // determined once, from the first peeked character.
+   //
+   const bool line_start (c.column == 1);
+
+   while (!eos (c))
+   {
+     if (c == '#')
+     {
+       // We only recognize '#' as a start of a comment at the beginning
+       // of the line (sans leading spaces).
+       //
+       if (!line_start)
+         return c;
+
+       get ();
+
+       // Swallow the rest of the comment, leaving the newline (or eos)
+       // to be handled by the next iteration.
+       //
+       for (c = peek (); !eos (c) && c != '\n'; c = peek ())
+         get ();
+     }
+     else if (c == '\n')
+     {
+       // Empty lines are only skipped if we started at the line beginning.
+       //
+       if (!line_start)
+         return c;
+
+       get ();
+     }
+     else if (c == ' ' || c == '\t')
+       get ();
+     else
+       return c; // Not a space.
+
+     c = peek ();
+   }
+
+   return c;
+ }
+
+ // manifest_parsing
+ //
+
+ static string
+ format (const string& n, uint64_t l, uint64_t c, const string& d)
+ {
+ ostringstream os;
+ if (!n.empty ())
+ os << n << ':';
+ os << l << ':' << c << ": error: " << d;
+ return os.str ();
+ }
+
+ // Initialize the runtime_error base with the message composed by format ()
+ // and keep the individual components in the data members.
+ //
+ manifest_parsing::
+ manifest_parsing (const string& n, uint64_t l, uint64_t c, const string& d)
+ : runtime_error (format (n, l, c, d)),
+ name (n), line (l), column (c), description (d)
+ {
+ }
+}
diff --git a/butl/manifest-serializer b/butl/manifest-serializer
new file mode 100644
index 0000000..6d7eeec
--- /dev/null
+++ b/butl/manifest-serializer
@@ -0,0 +1,75 @@
+// file : butl/manifest-serializer -*- C++ -*-
+// copyright : Copyright (c) 2014-2016 Code Synthesis Ltd
+// license : MIT; see accompanying LICENSE file
+
+#ifndef BUTL_MANIFEST_SERIALIZER
+#define BUTL_MANIFEST_SERIALIZER
+
+#include <string>
+#include <iosfwd>
+#include <cstddef> // size_t
+#include <stdexcept> // runtime_error
+
+#include <butl/export>
+
+namespace butl
+{
+ // Exception describing a manifest serialization error. The runtime_error
+ // message has the form "<name>: error: <description>" (the "<name>: "
+ // prefix is omitted if the name is empty). The individual components are
+ // also available as the public data members.
+ //
+ class LIBBUTL_EXPORT manifest_serialization: public std::runtime_error
+ {
+ public:
+ manifest_serialization (const std::string& name,
+ const std::string& description);
+
+ std::string name; // Serializer name (see manifest_serializer::name ()).
+ std::string description; // Error description without the name prefix.
+ };
+
+ // Manifest serializer that writes name/value pairs, one per call to
+ // next (), to an output stream.
+ //
+ class LIBBUTL_EXPORT manifest_serializer
+ {
+ public:
+ // Create a serializer writing to the specified stream. The stream is
+ // held by reference. The name is stored and passed to the
+ // manifest_serialization exceptions thrown by this serializer.
+ //
+ manifest_serializer (std::ostream& os, const std::string& name)
+ : os_ (os), name_ (name) {}
+
+ const std::string&
+ name () const {return name_;}
+
+ // The first name-value pair should be the special "start-of-manifest"
+ // with empty name and value being the format version. After that we
+ // have a sequence of ordinary pairs which are the manifest. At the
+ // end of the manifest we have the special "end-of-manifest" pair
+ // with empty name and value. After that we can either have another
+ // start-of-manifest pair (in which case the whole sequence repeats
+ // from the beginning) or we get another end-of-manifest pair which
+ // signals the end of stream.
+ //
+ // Throw manifest_serialization if the pair is invalid or does not fit
+ // into the above sequence.
+ //
+ void
+ next (const std::string& name, const std::string& value);
+
+ // Write a comment. The supplied text is prefixed with "# " and
+ // terminated with a newline. If the text is empty, then just '#'
+ // followed by a newline is written. Throw manifest_serialization
+ // if called after the end of stream.
+ //
+ void
+ comment (const std::string&);
+
+ private:
+ // Throw manifest_serialization if the name starts with '#' or contains
+ // a space, tab, newline, or ':'.
+ //
+ void
+ check_name (const std::string&);
+
+ // Write 'n' characters from 's' (assuming there are no newlines)
+ // split into multiple lines at or near the 78 characters
+ // boundary. The first line starts at the 'column' offset.
+ //
+ void
+ write_value (std::size_t column, const char* s, std::size_t n);
+
+ private:
+ // Serialization state: start -- the format version pair is expected
+ // next; body -- inside a manifest; end -- end of stream has been
+ // serialized.
+ //
+ enum {start, body, end} s_ = start;
+ std::string version_; // Current format version.
+
+ private:
+ std::ostream& os_;
+ const std::string name_;
+ };
+}
+
+#endif // BUTL_MANIFEST_SERIALIZER
diff --git a/butl/manifest-serializer.cxx b/butl/manifest-serializer.cxx
new file mode 100644
index 0000000..c45aaba
--- /dev/null
+++ b/butl/manifest-serializer.cxx
@@ -0,0 +1,238 @@
+// file : butl/manifest-serializer.cxx -*- C++ -*-
+// copyright : Copyright (c) 2014-2016 Code Synthesis Ltd
+// license : MIT; see accompanying LICENSE file
+
+#include <butl/manifest-serializer>
+
+#include <ostream>
+#include <cassert>
+
+using namespace std;
+
+namespace butl
+{
+ using serialization = manifest_serialization;
+
+ // Serialize the next pair according to the current state:
+ //
+ // start - The name must be empty. An empty value signals the end of
+ // stream (the stream is flushed and the state becomes end).
+ // Otherwise the value is the format version, which must be "1";
+ // it is only written after ':' if it differs from the version
+ // written last.
+ //
+ // body - An empty name with an empty value ends the current manifest
+ // (nothing is written). Otherwise the name is checked and the
+ // pair is written in either the simple or the multi-line mode.
+ //
+ // end - Always an error.
+ //
+ // Throw manifest_serialization on errors.
+ //
+ void manifest_serializer::
+ next (const string& n, const string& v)
+ {
+ switch (s_)
+ {
+ case start:
+ {
+ if (!n.empty ())
+ throw serialization (name_, "format version pair expected");
+
+ if (v.empty ())
+ {
+ // End of manifests.
+ //
+ os_.flush ();
+ s_ = end;
+ break;
+ }
+
+ if (v != "1")
+ throw serialization (name_, "unsupported format version " + v);
+
+ os_ << ':';
+
+ // Only write the version if it differs from the one written last.
+ //
+ if (v != version_)
+ {
+ os_ << ' ' << v;
+ version_ = v;
+ }
+
+ os_ << endl;
+ s_ = body;
+ break;
+ }
+ case body:
+ {
+ if (n.empty ())
+ {
+ if (!v.empty ())
+ throw serialization (name_, "non-empty value in end pair");
+
+ s_ = start;
+ break;
+ }
+
+ check_name (n);
+
+ os_ << n << ':';
+
+ if (!v.empty ())
+ {
+ os_ << ' ';
+
+ // Use the multi-line mode in any of the following cases:
+ //
+ // - name is too long (say, longer than 37 (78/2 - 2) characters);
+ // we cannot start on the next line since that would start the
+ // multi-line mode
+ // - value contains newlines
+ // - value contains leading/trailing whitespaces
+ //
+ if (n.size () > 37 ||
+ v.find ('\n') != string::npos ||
+ v.front () == ' ' || v.front () == '\t' ||
+ v.back () == ' ' || v.back () == '\t')
+ {
+ os_ << "\\" << endl; // Multi-line mode introductor.
+
+ // Chunk the value into fragments separated by newlines.
+ //
+ for (size_t i (0), p (v.find ('\n')); ; p = v.find ('\n', i))
+ {
+ if (p == string::npos)
+ {
+ // Last chunk.
+ //
+ write_value (0, v.c_str () + i, v.size () - i);
+ break;
+ }
+
+ write_value (0, v.c_str () + i, p - i);
+ os_ << endl;
+ i = p + 1;
+ }
+
+ os_ << endl << "\\"; // Multi-line mode terminator.
+ }
+ else
+ write_value (n.size () + 2, v.c_str (), v.size ());
+ }
+
+ os_ << endl;
+ break;
+ }
+ case end:
+ {
+ throw serialization (name_, "serialization after eos");
+ }
+ }
+ }
+
+ // Write a comment line: just '#' if the text is empty and "# <text>"
+ // otherwise, followed by a newline. Throw manifest_serialization if the
+ // end of stream has already been serialized.
+ //
+ void manifest_serializer::
+ comment (const string& t)
+ {
+   if (s_ == end)
+     throw serialization (name_, "serialization after eos");
+
+   if (t.empty ())
+     os_ << '#' << endl;
+   else
+     os_ << '#' << ' ' << t << endl;
+ }
+
+ // Verify that the name can be serialized: it must not start with '#' nor
+ // contain a space, tab, newline, or ':'. Throw manifest_serialization for
+ // the first violation found.
+ //
+ void manifest_serializer::
+ check_name (const string& n)
+ {
+   if (n[0] == '#')
+     throw serialization (name_, "name starts with '#'");
+
+   for (string::const_iterator i (n.begin ()), e (n.end ()); i != e; ++i)
+   {
+     const char ch (*i);
+
+     if (ch == ' ' || ch == '\t' || ch == '\n')
+       throw serialization (name_, "name contains whitespace");
+
+     if (ch == ':')
+       throw serialization (name_, "name contains ':'");
+   }
+ }
+
+ // Write n characters from s (which should contain no newlines) breaking
+ // the output into multiple lines with the backslash-newline sequence. The
+ // cl argument is the column at which the first line starts. A line is
+ // preferably broken before a whitespace that is close enough to the end
+ // of the line and, failing that, at the hard limit. If the last character
+ // written is a backslash, it is escaped with another backslash.
+ //
+ void manifest_serializer::
+ write_value (size_t cl, const char* s, size_t n)
+ {
+ char c ('\0');
+
+ // The idea is to break on the 77th character (i.e., write it
+ // on the next line) which means we have written 76 characters
+ // on this line plus 2 for '\' and '\n', which gives us 78.
+ //
+ for (const char* e (s + n); s != e; s++, cl++)
+ {
+ c = *s;
+ bool br (false); // Break the line.
+
+ // If this is a whitespace, see if it's a good place to break the
+ // line.
+ //
+ if (c == ' ' || c == '\t')
+ {
+ // Find the next whitespace (or the end) and see if it is a better
+ // place.
+ //
+ for (const char* w (s + 1); ; w++)
+ {
+ if (w == e || *w == ' ' || *w == '\t')
+ {
+ // Is this whitespace past where we need to break? Also see
+ // below the "hard" break case for why we use 78 at the end.
+ //
+ if (cl + static_cast<size_t> (w - s) > (w != e ? 77 : 78))
+ {
+ // Only break if this whitespace is close enough to
+ // the end of the line.
+ //
+ br = (cl > 57);
+ }
+
+ break;
+ }
+ }
+ }
+
+ // Do we have to do a "hard" break (i.e., without a whitespace)?
+ // If there is just one character left, then instead of writing
+ // '\' and then the character on the next line, we might as well
+ // write it on this line.
+ //
+ if (cl == (s + 1 != e ? 77 : 78))
+ br = true;
+
+ if (br)
+ {
+ os_ << '\\' << endl;
+ cl = 0; // The character written below starts the new line.
+ }
+
+ os_ << c;
+ }
+
+ // What comes next is always a newline. If the last character that
+ // we have written is a backslash, escape it.
+ //
+ if (c == '\\')
+ os_ << '\\';
+ }
+
+ // manifest_serialization
+ //
+
+ static string
+ format (const string& n, const string& d)
+ {
+ string r;
+ if (!n.empty ())
+ {
+ r += n;
+ r += ": ";
+ }
+ r += "error: ";
+ r += d;
+ return r;
+ }
+
+ // Initialize the runtime_error base with the message composed by format ()
+ // and keep the individual components in the data members.
+ //
+ manifest_serialization::
+ manifest_serialization (const string& n, const string& d)
+ : runtime_error (format (n, d)), name (n), description (d)
+ {
+ }
+}