diff options
author | Karen Arutyunov <karen@codesynthesis.com> | 2016-09-29 21:54:14 +0300 |
---|---|---|
committer | Karen Arutyunov <karen@codesynthesis.com> | 2016-09-29 23:28:03 +0300 |
commit | 25a9484378ddaae9602ec54532cdc03b1f1924ef (patch) | |
tree | 7aafb613337eb6c6aee4fef78b8345405c4d7f70 /butl | |
parent | f4f6d906733027a7bd802e035b3e9852db7be967 (diff) |
Add manifest_parser and manifest_serializer
Diffstat (limited to 'butl')
-rw-r--r-- | butl/buildfile | 45 | ||||
-rw-r--r-- | butl/manifest-forward | 15 | ||||
-rw-r--r-- | butl/manifest-parser | 94 | ||||
-rw-r--r-- | butl/manifest-parser.cxx | 379 | ||||
-rw-r--r-- | butl/manifest-serializer | 75 | ||||
-rw-r--r-- | butl/manifest-serializer.cxx | 238 |
6 files changed, 825 insertions, 21 deletions
diff --git a/butl/buildfile b/butl/buildfile index 1a9787a..fcb5f86 100644 --- a/butl/buildfile +++ b/butl/buildfile @@ -2,27 +2,30 @@ # copyright : Copyright (c) 2014-2016 Code Synthesis Ltd # license : MIT; see accompanying LICENSE file -lib{butl}: \ -{hxx cxx}{ base64 } \ -{hxx cxx}{ char-scanner } \ -{hxx }{ export } \ -{hxx ixx cxx}{ fdstream } \ -{hxx ixx cxx}{ filesystem } \ -{hxx }{ multi-index } \ -{hxx }{ optional } \ -{hxx cxx}{ pager } \ -{hxx ixx txx cxx}{ path } \ -{hxx }{ path-io } \ -{hxx }{ path-map } \ -{hxx txx }{ prefix-map } \ -{hxx ixx cxx}{ process } \ -{hxx cxx}{ sha256 } \ -{hxx txx }{ string-table } \ -{hxx cxx}{ timestamp } \ -{hxx cxx}{ triplet } \ -{hxx ixx }{ utility } \ -{hxx }{ vector-view } \ -{hxx }{ version } +lib{butl}: \ +{hxx cxx}{ base64 } \ +{hxx cxx}{ char-scanner } \ +{hxx }{ export } \ +{hxx ixx cxx}{ fdstream } \ +{hxx ixx cxx}{ filesystem } \ +{hxx }{ manifest-forward } \ +{hxx cxx}{ manifest-parser } \ +{hxx cxx}{ manifest-serializer } \ +{hxx }{ multi-index } \ +{hxx }{ optional } \ +{hxx cxx}{ pager } \ +{hxx ixx txx cxx}{ path } \ +{hxx }{ path-io } \ +{hxx }{ path-map } \ +{hxx txx }{ prefix-map } \ +{hxx ixx cxx}{ process } \ +{hxx cxx}{ sha256 } \ +{hxx txx }{ string-table } \ +{hxx cxx}{ timestamp } \ +{hxx cxx}{ triplet } \ +{hxx ixx }{ utility } \ +{hxx }{ vector-view } \ +{hxx }{ version } # Exclude these from compilation on non-Windows targets. # diff --git a/butl/manifest-forward b/butl/manifest-forward new file mode 100644 index 0000000..5dc5060 --- /dev/null +++ b/butl/manifest-forward @@ -0,0 +1,15 @@ +// file : butl/manifest-forward -*- C++ -*- +// copyright : Copyright (c) 2014-2016 Code Synthesis Ltd +// license : MIT; see accompanying LICENSE file + +#ifndef BUTL_MANIFEST_FORWARD +#define BUTL_MANIFEST_FORWARD + +namespace butl +{ + class manifest_parser; + class manifest_serializer; + class manifest_name_value; +} + +#endif // BUTL_MANIFEST_FORWARD diff --git a/butl/manifest-parser b/butl/manifest-parser new file mode 100644 index 0000000..a005b34 --- /dev/null +++ b/butl/manifest-parser @@ -0,0 +1,94 @@ +// file : butl/manifest-parser -*- C++ -*- +// copyright : Copyright (c) 2014-2016 Code Synthesis Ltd +// license : MIT; see accompanying LICENSE file + +#ifndef BUTL_MANIFEST_PARSER +#define BUTL_MANIFEST_PARSER + +#include <string> +#include <iosfwd> +#include <cstdint> // uint64_t +#include <stdexcept> // runtime_error + +#include <butl/export> + +#include <butl/char-scanner> + +namespace butl +{ + class LIBBUTL_EXPORT manifest_parsing: public std::runtime_error + { + public: + manifest_parsing (const std::string& name, + std::uint64_t line, + std::uint64_t column, + const std::string& description); + + std::string name; + std::uint64_t line; + std::uint64_t column; + std::string description; + }; + + class manifest_name_value + { + public: + std::string name; + std::string value; + + std::uint64_t name_line; + std::uint64_t name_column; + + std::uint64_t value_line; + std::uint64_t value_column; + + bool + empty () const {return name.empty () && value.empty ();} + }; + + class LIBBUTL_EXPORT manifest_parser: protected butl::char_scanner + { + public: + manifest_parser (std::istream& is, const std::string& name) + : char_scanner (is), name_ (name) {} + + const std::string& + name () const {return name_;} + + // The first returned pair is special "start-of-manifest" with + // empty name and value being the format version: {"", "<ver>"}. + // After that we have a sequence of ordinary pairs which are + // the manifest. At the end of the manifest we have the special + // "end-of-manifest" pair with empty name and value: {"", ""}. + // After that we can either get another start-of-manifest pair + // (in which case the whole sequence repeats from the beginning) + // or we get another end-of-manifest pair which signals the end + // of stream (aka EOF). To put it another way, the parse sequence + // always has the following form: + // + // ({"", "<ver>"} {"<name>", "<value>"}* {"", ""})* {"", ""} + // + manifest_name_value + next (); + + private: + void + parse_name (manifest_name_value&); + + void + parse_value (manifest_name_value&); + + // Skip spaces and return the first peeked non-space character. + // + xchar + skip_spaces (); + + private: + const std::string name_; + + enum {start, body, end} s_ = start; + std::string version_; // Current format version. + }; +} + +#endif // BUTL_MANIFEST_PARSER diff --git a/butl/manifest-parser.cxx b/butl/manifest-parser.cxx new file mode 100644 index 0000000..ec26ca8 --- /dev/null +++ b/butl/manifest-parser.cxx @@ -0,0 +1,379 @@ +// file : butl/manifest-parser.cxx -*- C++ -*- +// copyright : Copyright (c) 2014-2016 Code Synthesis Ltd +// license : MIT; see accompanying LICENSE file + +#include <butl/manifest-parser> + +#include <cassert> +#include <sstream> + +using namespace std; + +namespace butl +{ + using parsing = manifest_parsing; + using name_value = manifest_name_value; + + name_value manifest_parser:: + next () + { + if (s_ == end) + return name_value {"", "", line, column, line, column}; + + xchar c (skip_spaces ()); + + // Here is the problem: if we are in the 'body' state (that is, + // we are parsing inside the manifest) and we see the special + // empty name, then before returning the "start" pair for the + // next manifest, we have to return the "end" pair. One way + // would be to cache the "start" pair and return it on the + // next call of next(). But that would require quite a bit + // of extra logic. The alternative is to detect the beginning + // of the empty name before parsing too far. This way, the + // next call to next() will start parsing where we left of + // and return the "start" pair naturally. + // + if (s_ == body && c == ':') + { + s_ = start; + return name_value {"", "", c.line, c.column, c.line, c.column}; + } + + // Regardless of the state, what should come next is a name, + // potentially the special empty one. + // + name_value r; + parse_name (r); + + skip_spaces (); + c = get (); + + if (eos (c)) + { + // This is ok as long as the name is empty. + // + if (!r.name.empty ()) + throw parsing (name_, c.line, c.column, "':' expected after name"); + + s_ = end; + + // The "end" pair. + // + r.value_line = r.name_line; + r.value_column = r.name_column; + return r; + } + + if (c != ':') + throw parsing (name_, c.line, c.column, "':' expected after name"); + + skip_spaces (); + parse_value (r); + + c = peek (); + + // The character after the value should be either a newline or eos. + // + assert (c == '\n' || eos (c)); + + if (c == '\n') + get (); + + // Now figure out whether what we've got makes sense, depending + // on the state we are in. + // + if (s_ == start) + { + // Start of the (next) manifest. The first pair should be the + // special empty name/format version. + // + if (!r.name.empty ()) + throw parsing (name_, r.name_line, r.name_column, + "format version pair expected"); + + // The version value is only mandatory for the first manifest in + // a sequence. + // + if (r.value.empty ()) + { + if (version_.empty ()) + throw parsing (name_, r.value_line, r.value_column, + "format version value expected"); + r.value = version_; + } + else + { + version_ = r.value; // Update with the latest. + + if (version_ != "1") + throw parsing (name_, r.value_line, r.value_column, + "unsupported format version " + version_); + } + + s_ = body; + } + else + { + // Parsing the body of the manifest. + // + + // Should have been handled by the special case above. + // + assert (!r.name.empty ()); + } + + return r; + } + + void manifest_parser:: + parse_name (name_value& r) + { + xchar c (peek ()); + + r.name_line = c.line; + r.name_column = c.column; + + for (; !eos (c); c = peek ()) + { + if (c == ':' || c == ' ' || c == '\t' || c == '\n') + break; + + r.name += c; + get (); + } + } + + void manifest_parser:: + parse_value (name_value& r) + { + xchar c (peek ()); + + r.value_line = c.line; + r.value_column = c.column; + + string& v (r.value); + string::size_type n (0); // Size of last non-space character (simple mode). + + // Detect the multi-line mode introductor. + // + bool ml (false); + if (c == '\\') + { + get (); + xchar p (peek ()); + + if (p == '\n') + { + get (); // Newline is not part of the value so skip it. + c = peek (); + ml = true; + } + else if (eos (p)) + ml = true; + else + unget (c); + } + + // The nl flag signals that the preceding character was a "special + // newline", that is, a newline that was part of the milti-line mode + // introductor or an escape sequence. + // + for (bool nl (ml); !eos (c); c = peek ()) + { + // Detect the special "\n\\\n" sequence. In the multi-line mode, + // this is a "terminator". In the simple mode, this is a way to + // specify a newline. + // + // The key idea here is this: if we "swallowed" any characters + // (i.e., called get() without a matching unget()), then we + // have to restart the loop in order to do all the tests for + // the next character. Also, for this to work, we can only + // add one character to v, which limits us to maximum three + // characters look-ahead: one in v, one "ungot", and one + // peeked. + // + // The first block handles the special sequence that starts with + // a special newline. In multi-line mode, this is an "immediate + // termination" where we "use" the newline from the introductor. + // Note also that in the simple mode the special sequence can + // only start with a special (i.e., escaped) newline. + // + if (nl) + { + nl = false; + + if (c == '\\') + { + get (); + xchar c1 (peek ()); + + if (c1 == '\n' || eos (c1)) + { + if (ml) + break; + else + { + if (c1 == '\n') + get (); + + v += '\n'; // Literal newline. + n = v.size (); + continue; // Restart from the next character. + } + } + else + unget (c); // Fall through. + } + } + + if (c == '\n') + { + if (ml) + { + get (); + xchar c1 (peek ()); + + if (c1 == '\\') + { + get (); + xchar c2 (peek ()); + + if (c2 == '\n' || eos (c2)) + break; + else + { + v += '\n'; + unget (c1); + continue; // Restart from c1 (slash). + } + } + else + unget (c); // Fall through. + } + else + break; // Simple value terminator. + } + + // Detect the newline escape sequence. The same look-ahead + // approach as above. + // + if (c == '\\') + { + get (); + xchar c1 (peek ()); + + if (c1 == '\n' || eos (c1)) + { + if (c1 == '\n') + { + get (); + nl = true; // This is a special newline. + } + continue; // Restart from the next character. + } + else if (c1 == '\\') + { + get (); + xchar c2 (peek ()); + + if (c2 == '\n' || eos (c1)) + { + v += '\\'; + n = v.size (); + // Restart from c2 (newline/eos). + } + else + { + v += '\\'; + n = v.size (); + unget (c1); // Restart from c1 (second slash). + } + + continue; + } + else + unget (c); // Fall through. + } + + get (); + v += c; + + if (!ml && c != ' ' && c != '\t') + n = v.size (); + } + + // Cut off trailing whitespaces. + // + if (!ml) + v.resize (n); + } + + manifest_parser::xchar manifest_parser:: + skip_spaces () + { + xchar c (peek ()); + bool start (c.column == 1); + + for (; !eos (c); c = peek ()) + { + switch (c) + { + case ' ': + case '\t': + break; + case '\n': + { + // Skip empty lines. + // + if (!start) + return c; + + break; + } + case '#': + { + // We only recognize '#' as a start of a comment at the beginning + // of the line (sans leading spaces). + // + if (!start) + return c; + + get (); + + // Read until newline or eos. + // + for (c = peek (); !eos (c) && c != '\n'; c = peek ()) + get (); + + continue; + } + default: + return c; // Not a space. + } + + get (); + } + + return c; + } + + // manifest_parsing + // + + static string + format (const string& n, uint64_t l, uint64_t c, const string& d) + { + ostringstream os; + if (!n.empty ()) + os << n << ':'; + os << l << ':' << c << ": error: " << d; + return os.str (); + } + + manifest_parsing:: + manifest_parsing (const string& n, uint64_t l, uint64_t c, const string& d) + : runtime_error (format (n, l, c, d)), + name (n), line (l), column (c), description (d) + { + } +} diff --git a/butl/manifest-serializer b/butl/manifest-serializer new file mode 100644 index 0000000..6d7eeec --- /dev/null +++ b/butl/manifest-serializer @@ -0,0 +1,75 @@ +// file : butl/manifest-serializer -*- C++ -*- +// copyright : Copyright (c) 2014-2016 Code Synthesis Ltd +// license : MIT; see accompanying LICENSE file + +#ifndef BUTL_MANIFEST_SERIALIZER +#define BUTL_MANIFEST_SERIALIZER + +#include <string> +#include <iosfwd> +#include <cstddef> // size_t +#include <stdexcept> // runtime_error + +#include <butl/export> + +namespace butl +{ + class LIBBUTL_EXPORT manifest_serialization: public std::runtime_error + { + public: + manifest_serialization (const std::string& name, + const std::string& description); + + std::string name; + std::string description; + }; + + class LIBBUTL_EXPORT manifest_serializer + { + public: + manifest_serializer (std::ostream& os, const std::string& name) + : os_ (os), name_ (name) {} + + const std::string& + name () const {return name_;} + + // The first name-value pair should be the special "start-of-manifest" + // with empty name and value being the format version. After that we + // have a sequence of ordinary pairs which are the manifest. At the + // end of the manifest we have the special "end-of-manifest" pair + // with empty name and value. After that we can either have another + // start-of-manifest pair (in which case the whole sequence repeats + // from the beginning) or we get another end-of-manifest pair which + // signals the end of stream. + // + void + next (const std::string& name, const std::string& value); + + // Write a comment. The supplied text is prefixed with "# " and + // terminated with a newline. + // + void + comment (const std::string&); + + private: + void + check_name (const std::string&); + + // Write 'n' characters from 's' (assuming there are no newlines) + // split into multiple lines at or near the 78 characters + // boundary. The first line starts at the 'column' offset. + // + void + write_value (std::size_t column, const char* s, std::size_t n); + + private: + enum {start, body, end} s_ = start; + std::string version_; // Current format version. + + private: + std::ostream& os_; + const std::string name_; + }; +} + +#endif // BUTL_MANIFEST_SERIALIZER diff --git a/butl/manifest-serializer.cxx b/butl/manifest-serializer.cxx new file mode 100644 index 0000000..c45aaba --- /dev/null +++ b/butl/manifest-serializer.cxx @@ -0,0 +1,238 @@ +// file : butl/manifest-serializer.cxx -*- C++ -*- +// copyright : Copyright (c) 2014-2016 Code Synthesis Ltd +// license : MIT; see accompanying LICENSE file + +#include <butl/manifest-serializer> + +#include <ostream> +#include <cassert> + +using namespace std; + +namespace butl +{ + using serialization = manifest_serialization; + + void manifest_serializer:: + next (const string& n, const string& v) + { + switch (s_) + { + case start: + { + if (!n.empty ()) + throw serialization (name_, "format version pair expected"); + + if (v.empty ()) + { + // End of manifests. + // + os_.flush (); + s_ = end; + break; + } + + if (v != "1") + throw serialization (name_, "unsupported format version " + v); + + os_ << ':'; + + if (v != version_) + { + os_ << ' ' << v; + version_ = v; + } + + os_ << endl; + s_ = body; + break; + } + case body: + { + if (n.empty ()) + { + if (!v.empty ()) + throw serialization (name_, "non-empty value in end pair"); + + s_ = start; + break; + } + + check_name (n); + + os_ << n << ':'; + + if (!v.empty ()) + { + os_ << ' '; + + // Use the multi-line mode in any of the following cases: + // + // - name is too long (say longer than 37 (78/2 - 2) characters; + // we cannot start on the next line since that would start the + // multi-line mode) + // - value contains newlines + // - value contains leading/trailing whitespaces + // + if (n.size () > 37 || + v.find ('\n') != string::npos || + v.front () == ' ' || v.front () == '\t' || + v.back () == ' ' || v.back () == '\t') + { + os_ << "\\" << endl; // Multi-line mode introductor. + + // Chunk the value into fragments separated by newlines. + // + for (size_t i (0), p (v.find ('\n')); ; p = v.find ('\n', i)) + { + if (p == string::npos) + { + // Last chunk. + // + write_value (0, v.c_str () + i, v.size () - i); + break; + } + + write_value (0, v.c_str () + i, p - i); + os_ << endl; + i = p + 1; + } + + os_ << endl << "\\"; // Multi-line mode terminator. + } + else + write_value (n.size () + 2, v.c_str (), v.size ()); + } + + os_ << endl; + break; + } + case end: + { + throw serialization (name_, "serialization after eos"); + } + } + } + + void manifest_serializer:: + comment (const string& t) + { + if (s_ == end) + throw serialization (name_, "serialization after eos"); + + os_ << '#'; + + if (!t.empty ()) + os_ << ' ' << t; + + os_ << endl; + } + + void manifest_serializer:: + check_name (const string& n) + { + if (n[0] == '#') + throw serialization (name_, "name starts with '#'"); + + for (char c: n) + { + switch (c) + { + case ' ': + case '\t': + case '\n': throw serialization (name_, "name contains whitespace"); + case ':': throw serialization (name_, "name contains ':'"); + default: break; + } + } + } + + void manifest_serializer:: + write_value (size_t cl, const char* s, size_t n) + { + char c ('\0'); + + // The idea is to break on the 77th character (i.e., write it + // on the next line) which means we have written 76 characters + // on this line plus 2 for '\' and '\n', which gives us 78. + // + for (const char* e (s + n); s != e; s++, cl++) + { + c = *s; + bool br (false); // Break the line. + + // If this is a whitespace, see if it's a good place to break the + // line. + // + if (c == ' ' || c == '\t') + { + // Find the next whitespace (or the end) and see if it is a better + // place. + // + for (const char* w (s + 1); ; w++) + { + if (w == e || *w == ' ' || *w == '\t') + { + // Is this whitespace past where we need to break? Also see + // below the "hard" break case for why we use 78 at the end. + // + if (cl + static_cast<size_t> (w - s) > (w != e ? 77 : 78)) + { + // Only break if this whitespace is close enough to + // the end of the line. + // + br = (cl > 57); + } + + break; + } + } + } + + // Do we have to do a "hard" break (i.e., without a whitespace)? + // If there is just one character left, then instead of writing + // '\' and then the character on the next line, we might as well + // write it on this line. + // + if (cl == (s + 1 != e ? 77 : 78)) + br = true; + + if (br) + { + os_ << '\\' << endl; + cl = 0; + } + + os_ << c; + } + + // What comes next is always a newline. I the last character that + // we have written is a backslash, escape it. + // + if (c == '\\') + os_ << '\\'; + } + + // manifest_serialization + // + + static string + format (const string& n, const string& d) + { + string r; + if (!n.empty ()) + { + r += n; + r += ": "; + } + r += "error: "; + r += d; + return r; + } + + manifest_serialization:: + manifest_serialization (const string& n, const string& d) + : runtime_error (format (n, d)), name (n), description (d) + { + } +} |