Add JSON serializer (copy of libstud-json)

author: Boris Kolpackov <boris@codesynthesis.com> 2022-02-28 08:57:02 +0200
committer: Boris Kolpackov <boris@codesynthesis.com> 2022-03-02 13:58:55 +0200
commit: 152ea943395822f55591eadaf8e0f5aac263db5e (patch)
tree: 268a9dae37704b2398876dad44a5095ba6d6f750
parent: 0e5d575feceea4feac4b33e85626719e14f762a1 (diff)
4 files changed, 1277 insertions, 0 deletions
diff --git a/libbutl/json/event.hxx b/libbutl/json/event.hxx
new file mode 100644
index 0000000..77185cc
--- /dev/null
+++ b/libbutl/json/event.hxx
@@ -0,0 +1,27 @@
+#pragma once
+
+#include <cstddef>
+#include <cstdint>
+
+namespace butl
+{
+  namespace json
+  {
+    // Parsing/serialization event.
+    //
+    enum class event: std::uint8_t
+    {
+      begin_object = 1, // Start of an object (serialized as '{').
+      end_object,       // End of an object (serialized as '}').
+      begin_array,      // Start of an array (serialized as '[').
+      end_array,        // End of an array (serialized as ']').
+      name,             // Object member name.
+      string,           // String value.
+      number,           // Number value.
+      boolean,          // Boolean value.
+      null              // Null value.
+    };
+
+    constexpr std::size_t event_count = 9; // Number of enumerators in event.
+  }
+}
diff --git a/libbutl/json/serializer.cxx b/libbutl/json/serializer.cxx
new file mode 100644
index 0000000..84941ed
--- /dev/null
+++ b/libbutl/json/serializer.cxx
@@ -0,0 +1,669 @@
+#include <cstdio>               // snprintf
+#include <cstdarg>              // va_list
+#include <cstring>              // memcpy
+#include <ostream>
+
+#include <libbutl/json/serializer.hxx>
+
+using namespace std;
+
+namespace butl
+{
+  namespace json
+  {
+    using buffer     = buffer_serializer::buffer;
+    using error_code = invalid_json_output::error_code;
+
+    // Overflow callback: grow the container and rebind the buffer to it.
+    template <typename T>
+    static void
+    dynarray_overflow (void* data, event, buffer& buf, size_t extra)
+    {
+      T& c = *static_cast<T*> (data);
+      c.resize (buf.capacity + extra);
+      c.resize (c.capacity ()); // Also claim whatever else was allocated.
+      // Note: std::string::data() returns const char* before C++17.
+      buf.data = const_cast<typename T::value_type*> (c.data ());
+      buf.capacity = c.size ();
+    }
+
+    template <typename T>
+    static void
+    dynarray_flush (void* data, event, buffer& buf)
+    {
+      T* const c (static_cast<T*> (data));
+      c->resize (buf.size); // Trim to the amount of text written.
+      buf.data = const_cast<typename T::value_type*> (c->data ());
+      buf.capacity = buf.size;
+    }
+
+    buffer_serializer::
+    buffer_serializer (string& str, size_t indent)
+        : buffer_serializer (const_cast<char*> (str.data ()),
+                             size_, str.size (),
+                             dynarray_overflow<string>,
+                             dynarray_flush<string>,
+                             &str, indent)
+    {
+      size_ = str.size (); // Start writing after the existing contents.
+    }
+
+    buffer_serializer::
+    buffer_serializer (vector<char>& vec, size_t indent)
+        : buffer_serializer (vec.data (),
+                             size_, vec.size (),
+                             dynarray_overflow<vector<char>>,
+                             dynarray_flush<vector<char>>,
+                             &vec, indent)
+    {
+      size_ = vec.size (); // Start writing after the existing contents.
+    }
+
+    static void
+    ostream_overflow (void* data, event e, buffer& buf, size_t /*extra*/)
+    {
+      // Drain the buffered text into the stream and mark the buffer empty.
+      ostream& os (*static_cast<ostream*> (data));
+      if (os.write (static_cast<char*> (buf.data), buf.size).fail ())
+        throw invalid_json_output (
+            e, error_code::buffer_overflow, "unable to write JSON output text");
+      buf.size = 0;
+    }
+
+    static void
+    ostream_flush (void* data, event e, buffer& buf)
+    {
+      // First write out whatever is still buffered, then flush the stream.
+      //
+      ostream_overflow (data, e, buf, 0);
+
+      if (static_cast<ostream*> (data)->flush ().fail ())
+        throw invalid_json_output (
+            e, error_code::buffer_overflow, "unable to write JSON output text");
+    }
+
+    stream_serializer::
+    stream_serializer (ostream& out, size_t indent)
+        : buffer_serializer (tmp_, sizeof (tmp_), // Staging buffer.
+                             &ostream_overflow,
+                             &ostream_flush,
+                             &out,
+                             indent)
+    {
+    }
+
+    bool buffer_serializer::
+    next (optional<event> e, pair<const char*, size_t> val, bool check)
+    {
+      if (absent_ == 2) // Value sequence was already declared complete.
+        goto fail_complete;
+
+      if (e == nullopt)
+      {
+        if (!state_.empty ()) // Still inside an unclosed object or array.
+          goto fail_incomplete;
+
+        absent_++;
+        return false;
+      }
+
+      absent_ = 0; // Clear inter-value absent event.
+
+      {
+        state* st (state_.empty () ? nullptr : &state_.back ());
+
+        auto name_expected = [] (const state& s) // Even count in an object.
+        {
+          return s.type == event::begin_object && s.count % 2 == 0;
+        };
+
+        auto make_str = [] (const char* s, size_t n)
+        {
+          return make_pair (s, n);
+        };
+
+        // When it comes to pretty-printing, the common way to do it is along
+        // these lines:
+        //
+        // {
+        //   "str": "value",
+        //   "obj": {
+        //     "arr": [
+        //       1,
+        //       2,
+        //       3
+        //     ]
+        //   },
+        //   "num": 123
+        // }
+        //
+        // Empty objects and arrays are printed without a newline:
+        //
+        // {
+        //   "obj": {},
+        //   "arr": []
+        // }
+        //
+        // There are two types of separators: between name and value, which is
+        // always ": ", and before/after value inside an object or array which
+        // is either newline followed by indentation, or comma followed by
+        // newline followed by indentation (we also have separation between
+        // top-level values but that's orthogonal to pretty-printing).
+        //
+        // Based on this observation, we are going to handle the latter case by
+        // starting with the ",\n" string (in this->sep_) and pushing/popping
+        // indentation spaces as we enter/leave objects and arrays. We handle
+        // the cases where we don't need the comma by simply skipping it in the
+        // C-string pointer.
+        //
+        bool pp (indent_ != 0);
+
+        pair<const char*, size_t> sep;
+        if (st != nullptr)
+        {
+          // The name-value separator.
+          //
+          if (st->type == event::begin_object && st->count % 2 == 1)
+          {
+            sep = !pp ? make_str (":", 1) : make_str (": ", 2);
+          }
+          // We don't need the comma if we are closing the object or array.
+          //
+          else if (e == event::end_array || e == event::end_object)
+          {
+            // But in this case we need to unindent one level prior to writing
+            // the brace. Also handle the empty object/array as a special case.
+            //
+            sep = !pp || st->count == 0
+              ? make_str (nullptr, 0)
+              : make_str (sep_.c_str () + 1, sep_.size () - 1 - indent_);
+          }
+          // Or if this is the first value (note: must come after end_*).
+          //
+          else if (st->count == 0)
+          {
+            sep = !pp
+              ? make_str (nullptr, 0)
+              : make_str (sep_.c_str () + 1, sep_.size () - 1);
+          }
+          else
+          {
+            sep = !pp
+              ? make_str (",", 1)
+              : make_str (sep_.c_str (), sep_.size ());
+          }
+        }
+        else if (values_ != 0) // Subsequent top-level value.
+        {
+          // Top-level value separation. For now we always separate them with
+          // newlines, which is the most common/sensible way.
+          //
+          sep = make_str ("\n", 1);
+        }
+
+        switch (*e)
+        {
+        case event::begin_array:
+        case event::begin_object:
+          {
+            if (st != nullptr && name_expected (*st))
+              goto fail_unexpected_event;
+
+            write (*e,
+                   sep,
+                   make_str (e == event::begin_array ? "[" : "{", 1),
+                   false);
+
+            if (st != nullptr)
+              st->count++;
+
+            if (pp)
+              sep_.append (indent_, ' '); // Indent one more level.
+
+            state_.push_back (state {*e, 0}); // Enter the new nesting level.
+            break;
+          }
+        case event::end_array:
+        case event::end_object:
+          {
+            if (st == nullptr || (e == event::end_array
+                                  ? st->type != event::begin_array
+                                  : !name_expected (*st)))
+              goto fail_unexpected_event;
+
+            write (*e,
+                   sep,
+                   make_str (e == event::end_array ? "]" : "}", 1),
+                   false);
+
+            if (pp)
+              sep_.erase (sep_.size () - indent_); // Unindent one level.
+
+            state_.pop_back (); // Leave the nesting level (st is now invalid).
+            break;
+          }
+        case event::name:
+        case event::string:
+          {
+            if (e == event::name
+                ? (st == nullptr || !name_expected (*st))
+                : (st != nullptr && name_expected (*st)))
+              goto fail_unexpected_event;
+
+            write (*e, sep, val, check, '"');
+
+            if (st != nullptr)
+              st->count++;
+            break;
+          }
+        case event::null:
+        case event::boolean:
+          {
+            if (e == event::null && val.first == nullptr)
+              val = {"null", 4}; // Null event without text: supply it.
+            else if (check)
+            {
+              auto eq = [&val] (const char* v, size_t n)
+              {
+                return val.second == n && memcmp (val.first, v, n) == 0;
+              };
+
+              if (e == event::null)
+              {
+                if (!eq ("null", 4))
+                  goto fail_null;
+              }
+              else
+              {
+                if (!eq ("true", 4) && !eq ("false", 5))
+                  goto fail_bool;
+              }
+            }
+          }
+          // Fall through.
+        case event::number:
+          {
+            if (st != nullptr && name_expected (*st))
+              goto fail_unexpected_event;
+
+            write (*e, sep, val, check);
+
+            if (st != nullptr)
+              st->count++;
+            break;
+          }
+        }
+      }
+
+      if (state_.empty ()) // A top-level value has just been completed.
+      {
+        values_++;
+        if (flush_ != nullptr)
+          flush_ (data_, *e, buf_);
+
+        return false;
+      }
+
+      return true; // Inside an object or array: more events are required.
+
+    fail_complete:
+      throw invalid_json_output (
+          e, error_code::invalid_value, "value sequence is complete");
+    fail_incomplete:
+      throw invalid_json_output (
+          e, error_code::invalid_value, "value is incomplete");
+    fail_null:
+      throw invalid_json_output (
+          e, error_code::invalid_value, "invalid null value");
+    fail_bool:
+      throw invalid_json_output (
+          e, error_code::invalid_value, "invalid boolean value");
+    fail_unexpected_event:
+      throw invalid_json_output (
+          e, error_code::unexpected_event, "unexpected event");
+    }
+
+    // JSON escape sequences for control characters <= 0x1F.
+    // Indexed by the character value (see chunk_checked() in write()).
+    static const char* json_escapes[] =
+    {"\\u0000", "\\u0001", "\\u0002", "\\u0003", "\\u0004", "\\u0005",
+     "\\u0006", "\\u0007", "\\b",     "\\t",     "\\n",     "\\u000B",
+     "\\f",     "\\r",     "\\u000E", "\\u000F", "\\u0010", "\\u0011",
+     "\\u0012", "\\u0013", "\\u0014", "\\u0015", "\\u0016", "\\u0017",
+     "\\u0018", "\\u0019", "\\u001A", "\\u001B", "\\u001C", "\\u001D",
+     "\\u001E", "\\u001F"};
+
+    void buffer_serializer::
+    write (event e,
+           pair<const char*, size_t> sep,
+           pair<const char*, size_t> val,
+           bool check,
+           char q)
+    {
+      // Assumptions:
+      //
+      // 1. A call to overflow should be able to provide enough capacity to
+      //    write the entire separator (in other words, we are not going to
+      //    bother with chunking the separator).
+      //
+      // 2. Similarly, a call to overflow should be able to provide enough
+      //    capacity to write an entire UTF-8 multi-byte sequence.
+      //
+      // 3. Performance-wise, we do not expect very long contiguous sequences
+      //    of characters that require escaping.
+
+      // Total number of bytes remaining to be written and the capacity
+      // currently available.
+      //
+      size_t size (sep.second + val.second + (q != '\0' ? 2 : 0));
+      size_t cap (buf_.capacity - buf_.size);
+
+      auto grow = [this, e, &size, &cap] (size_t min, size_t extra = 0)
+      {
+        if (overflow_ == nullptr) // No overflow function: cannot grow.
+          return false;
+
+        extra += size;
+        extra -= cap; // Remaining output that does not fit as is.
+        overflow_ (data_, e, buf_, extra > min ? extra : min);
+        cap = buf_.capacity - buf_.size;
+
+        return cap >= min;
+      };
+
+      auto append = [this, &cap, &size] (const char* d, size_t s)
+      {
+        memcpy (static_cast<char*> (buf_.data) + buf_.size, d, s);
+        buf_.size += s;
+        cap -= s;
+        size -= s;
+      };
+
+      // Return the longest chunk of input that fits into the buffer and does
+      // not end in the middle of a multi-byte UTF-8 sequence. Assume value
+      // size and capacity are not 0. Return NULL in first if no chunk could
+      // be found that fits into the remaining space. In this case, second is
+      // the additional (to size) required space (used to handle escapes in
+      // the checked version).
+      //
+      // The basic idea is to seek in the input buffer to the capacity of the
+      // output buffer (unless the input is shorter than the output). If we
+      // ended up in the middle of a multi-byte UTF-8 sequence, then seek back
+      // until we end up at the UTF-8 sequence boundary. Note that this
+      // implementation assumes valid UTF-8.
+      //
+      auto chunk = [&cap, &val] () -> pair<const char*, size_t>
+      {
+        pair<const char*, size_t> r (nullptr, 0);
+
+        if (cap >= val.second) // The rest of the value fits entirely.
+          r = val;
+        else
+        {
+          // Start from the character past capacity and search for a UTF-8
+          // sequence boundary.
+          //
+          for (const char* p (val.first + cap); p != val.first; --p)
+          {
+            const auto u (static_cast<uint8_t> (*p));
+            if (u < 0x80 || u > 0xBF) // Not a continuation byte
+            {
+              r = {val.first, p - val.first};
+              break;
+            }
+          }
+        }
+
+        val.first += r.second; // Consume the returned chunk (if any).
+        val.second -= r.second;
+
+        return r;
+      };
+
+      // Escaping and UTF-8-validating version of chunk().
+      //
+      // There are three classes of mandatory escapes in a JSON string:
+      //
+      // - \\ and \"
+      //
+      // - \b \f \n \r \t for popular control characters
+      //
+      // - \u00NN for other control characters <= 0x1F
+      //
+      // If the input begins with a character that must be escaped, return
+      // only its escape sequence. Otherwise validate and return everything up
+      // to the end of input or buffer capacity, but cutting it short before
+      // the next character that must be escaped or the first UTF-8 sequence
+      // that would not fit.
+      //
+      // Return string::npos in second in case of a stray continuation byte or
+      // any byte in an invalid UTF-8 range (for example, an "overlong" 2-byte
+      // encoding of a 7-bit/ASCII character or a 4-, 5-, or 6-byte sequence
+      // that would encode a codepoint beyond the U+10FFFF Unicode limit).
+      //
+      auto chunk_checked = [&cap, &size, &val] () -> pair<const char*, size_t>
+      {
+        pair<const char*, size_t> r (nullptr, 0);
+
+        // Check whether the first character needs to be escaped.
+        //
+        const uint8_t c (val.first[0]);
+        if (c == '"')
+          r = {"\\\"", 2};
+        else if (c == '\\')
+          r = {"\\\\", 2};
+        else if (c <= 0x1F)
+        {
+          auto s (json_escapes[c]);
+          r = {s, s[1] == 'u' ? 6 : 2}; // \uXXXX is 6 bytes, others are 2.
+        }
+
+        if (r.first != nullptr)
+        {
+          // Return in second the additional (to size) space required.
+          //
+          if (r.second > cap)
+            return {nullptr, r.second - 1};
+
+          // If we had to escape the character then adjust size accordingly
+          // (see append() above).
+          //
+          size += r.second - 1;
+
+          val.first += 1;
+          val.second -= 1;
+          return r;
+        }
+
+        // First character doesn't need to be escaped. Return as much of the
+        // rest of the input as possible.
+        //
+        size_t i (0);
+        for (size_t n (min (cap, val.second)); i != n; i++)
+        {
+          const uint8_t c1 (val.first[i]);
+
+          if (c1 == '"' || c1 == '\\' || c1 <= 0x1F) // Needs to be escaped.
+            break;
+          else if (c1 >= 0x80) // Not ASCII, so validate as a UTF-8 sequence.
+          {
+            size_t i1 (i); // Position of the first byte.
+
+            // The control flow here is to continue if valid and to fall
+            // through to return on error.
+            //
+            if (c1 >= 0xC2 && c1 <= 0xDF) // 2-byte sequence.
+            {
+              if (i + 2 <= val.second) // Sequence is complete in JSON value.
+              {
+                if (i + 2 > cap) // Sequence won't fit.
+                  break;
+
+                const uint8_t c2 (val.first[++i]);
+
+                if (c2 >= 0x80 && c2 <= 0xBF)
+                  continue;
+              }
+            }
+            else if (c1 >= 0xE0 && c1 <= 0xEF) // 3-byte sequence.
+            {
+              if (i + 3 <= val.second)
+              {
+                if (i + 3 > cap)
+                  break;
+
+                const uint8_t c2 (val.first[++i]), c3 (val.first[++i]);
+
+                if (c3 >= 0x80 && c3 <= 0xBF)
+                {
+                  switch (c1)
+                  {
+                  case 0xE0: if (c2 >= 0xA0 && c2 <= 0xBF) continue; break;
+                  case 0xED: if (c2 >= 0x80 && c2 <= 0x9F) continue; break;
+                  default:   if (c2 >= 0x80 && c2 <= 0xBF) continue; break;
+                  }
+                }
+              }
+            }
+            else if (c1 >= 0xF0 && c1 <= 0xF4) // 4-byte sequence.
+            {
+              if (i + 4 <= val.second)
+              {
+                if (i + 4 > cap)
+                  break;
+
+                const uint8_t c2 (val.first[++i]),
+                              c3 (val.first[++i]),
+                              c4 (val.first[++i]);
+
+                if (c3 >= 0x80 && c3 <= 0xBF &&
+                    c4 >= 0x80 && c4 <= 0xBF)
+                {
+                  switch (c1)
+                  {
+                  case 0xF0: if (c2 >= 0x90 && c2 <= 0xBF) continue; break;
+                  case 0xF4: if (c2 >= 0x80 && c2 <= 0x8F) continue; break;
+                  default:   if (c2 >= 0x80 && c2 <= 0xBF) continue; break;
+                  }
+                }
+              }
+            }
+
+            r = {val.first, string::npos};
+
+            // Update val to point to the beginning of the invalid sequence.
+            //
+            val.first += i1;
+            val.second -= i1;
+
+            return r;
+          }
+        }
+
+        if (i != 0) // We have a chunk.
+        {
+          r = {val.first, i};
+
+          val.first += i;
+          val.second -= i;
+        }
+
+        return r;
+      };
+
+      // Value's original size (used to calculate the offset of the errant
+      // character in case of a validation failure).
+      //
+      const size_t vn (val.second);
+
+      // Write the separator, if any.
+      //
+      if (sep.second != 0)
+      {
+        if (cap < sep.second && !grow (sep.second))
+          goto fail_nospace;
+
+        append (sep.first, sep.second);
+      }
+
+      // Write the value's opening quote, if requested.
+      //
+      if (q != '\0')
+      {
+        if (cap == 0 && !grow (1))
+          goto fail_nospace;
+
+        append ("\"", 1);
+      }
+
+      // Write the value, unless empty.
+      //
+      while (val.second != 0)
+      {
+        pair<const char*, size_t> ch (nullptr, 0);
+
+        if (cap != 0) // Otherwise ch stays NULL and we grow below.
+          ch = check ? chunk_checked () : chunk ();
+
+        if (ch.first == nullptr)
+        {
+          // The minimum extra bytes we need the overflow function to be able
+          // to provide is based on these sequences that we do not break:
+          //
+          // - 4 bytes for a UTF-8 sequence
+          // - 6 bytes for an escaped Unicode sequence (\uXXXX).
+          //
+          if (!grow (6, ch.second))
+            goto fail_nospace;
+        }
+        else if (ch.second != string::npos)
+          append (ch.first, ch.second);
+        else
+          goto fail_utf8;
+      }
+
+      // Write the value's closing quote, if requested.
+      //
+      if (q != '\0')
+      {
+        if (cap == 0 && !grow (1))
+          goto fail_nospace;
+
+        append ("\"", 1);
+      }
+
+      return;
+
+      // Note: keep descriptions consistent with the parser.
+      //
+    fail_utf8:
+      throw invalid_json_output (e,
+                                 e == event::name ? error_code::invalid_name
+                                                  : error_code::invalid_value,
+                                 "invalid UTF-8 text",
+                                 vn - val.second);
+
+    fail_nospace:
+      throw invalid_json_output (
+          e, error_code::buffer_overflow, "insufficient space in buffer");
+    }
+
+    size_t buffer_serializer::
+    to_chars_impl (char* b, size_t n, const char* f, ...)
+    {
+      va_list a;
+      va_start (a, f);
+      const int r (vsnprintf (b, n, f, a));
+      va_end (a);
+
+      // Negative means an output error, >= n means the text was truncated.
+      // Compare as size_t so that a large n is not narrowed to int.
+      if (r < 0 || static_cast<size_t> (r) >= n)
+        throw invalid_json_output (event::number,
+                                   error_code::invalid_value,
+                                   "unable to convert number to string");
+
+      return static_cast<size_t> (r);
+    }
+  }
+}
diff --git a/libbutl/json/serializer.hxx b/libbutl/json/serializer.hxx
new file mode 100644
index 0000000..fad91e2
--- /dev/null
+++ b/libbutl/json/serializer.hxx
@@ -0,0 +1,379 @@
+#pragma once
+
+#ifdef BUILD2_BOOTSTRAP
+#  error JSON serializer not available during bootstrap
+#endif
+
+#include <array>
+#include <iosfwd>
+#include <string>
+#include <vector>
+#include <cstddef>     // size_t, nullptr_t
+#include <utility>     // pair
+#include <stdexcept>   // invalid_argument
+#include <type_traits> // enable_if, is_*
+
+#include <libbutl/optional.hxx> // butl::optional is std::optional or similar.
+
+#include <libbutl/json/event.hxx>
+
+#include <libbutl/export.hxx>
+
+namespace butl
+{
+  // Using the RFC8259 terminology: JSON (output) text, JSON value, object
+  // member.
+  //
+  namespace json
+  {
+    class invalid_json_output: public std::invalid_argument
+    {
+    public:
+      using event_type = json::event;
+
+      enum class error_code
+      {
+        buffer_overflow,  // Out of buffer space or unable to write output.
+        unexpected_event, // Event is not valid at this point.
+        invalid_name,     // Invalid object member name (e.g., bad UTF-8).
+        invalid_value     // Invalid value (e.g., bad UTF-8, null, boolean).
+      };
+
+      invalid_json_output (optional<event_type> event,
+                           error_code code,
+                           const char* description,
+                           std::size_t offset = std::string::npos);
+
+      invalid_json_output (optional<event_type> event,
+                           error_code code,
+                           const std::string& description,
+                           std::size_t offset = std::string::npos);
+
+      // Event that triggered the error. If the error is in the value, then
+      // offset points to the offending byte (for example, the beginning of an
+      // invalid UTF-8 byte sequence). Otherwise, offset is string::npos.
+      //
+      optional<event_type> event;
+      error_code           code;
+      std::size_t          offset;
+    };
+
+    // The serializer makes sure the resulting JSON is syntactically but not
+    // necessarily semantically correct. For example, it's possible to
+    // serialize a number event with non-numeric data.
+    //
+    // Note that unlike the parser, the serializer is always in the multi-
+    // value mode allowing the serialization of zero or more values. Note also
+    // that while values are separated with newlines, there is no trailing
+    // newline after the last (or only) value and the user is expected to add
+    // it manually if needed.
+    //
+    // Also note that while RFC8259 recommends object members to have unique
+    // names, the serializer does not enforce this.
+    //
+    class LIBBUTL_SYMEXPORT buffer_serializer
+    {
+    public:
+      // Serialize to string growing it as necessary.
+      //
+      // The indentation argument specifies the number of indentation spaces
+      // that should be used for pretty-printing. If 0 is passed, no
+      // pretty-printing is performed.
+      //
+      explicit
+      buffer_serializer (std::string&, std::size_t indentation = 2);
+
+      // Serialize to vector of characters growing it as necessary.
+      //
+      explicit
+      buffer_serializer (std::vector<char>&, std::size_t indentation = 2);
+
+      // Serialize to a fixed array.
+      //
+      // The length of the output text written is tracked in the size
+      // argument.
+      //
+      // If the array is not big enough to store the entire output text, the
+      // next() call that reaches the limit will throw invalid_json_output.
+      //
+      template <std::size_t N>
+      buffer_serializer (std::array<char, N>&, std::size_t& size,
+                         std::size_t indentation = 2);
+
+      // Serialize to a fixed buffer.
+      //
+      // The length of the output text written is tracked in the size
+      // argument.
+      //
+      // If the buffer is not big enough to store the entire output text, the
+      // next() call that reaches the limit will throw invalid_json_output.
+      //
+      buffer_serializer (void* buf, std::size_t& size, std::size_t capacity,
+                         std::size_t indentation = 2);
+
+      // The overflow function is called when the output buffer is out of
+      // space. The extra argument is a hint indicating the extra space likely
+      // to be required.
+      //
+      // Possible strategies include re-allocating a larger buffer or flushing
+      // the contents of the original buffer to the output destination. In
+      // case of a reallocation, the implementation is responsible for copying
+      // the contents of the original buffer over.
+      //
+      // The flush function is called when the complete JSON value has been
+      // serialized to the buffer. It can be used to write the contents of the
+      // buffer to the output destination. Note that flush is not called after
+      // the second absent (nullopt) event (or the only absent event; see
+      // next() for details).
+      //
+      // Both functions are passed the original buffer, its size (the amount
+      // of output text), and its capacity. They return (by modifying the
+      // argument) the replacement buffer and its size and capacity (these may
+      // refer to the original buffer). If space cannot be made available, the
+      // implementation can throw an appropriate exception (for example,
+      // std::bad_alloc or std::ios_base::failure). Any exception thrown is
+      // propagated to the user.
+      //
+      struct buffer
+      {
+        void*        data;
+        std::size_t& size;
+        std::size_t  capacity;
+      };
+
+      using overflow_function = void (void* data,
+                                      event,
+                                      buffer&,
+                                      std::size_t extra);
+      using flush_function    = void (void* data, event, buffer&);
+
+      // Serialize using a custom buffer and overflow/flush functions (both
+      // are optional).
+      //
+      buffer_serializer (void* buf, std::size_t capacity,
+                         overflow_function*,
+                         flush_function*,
+                         void* data,
+                         std::size_t indentation = 2);
+
+      // As above but the length of the output text written is tracked in the
+      // size argument.
+      //
+      buffer_serializer (void* buf, std::size_t& size, std::size_t capacity,
+                         overflow_function*,
+                         flush_function*,
+                         void* data,
+                         std::size_t indentation = 2);
+
+      // Begin/end an object.
+      //
+      void
+      begin_object ();
+
+      void
+      end_object ();
+
+      // Serialize an object member (name and value).
+      //
+      // If check is false, then don't check whether the name (or value, if
+      // it's a string) is valid UTF-8 and don't escape any characters.
+      //
+      template <typename T>
+      void
+      member (const char* name, const T& value, bool check = true);
+
+      template <typename T>
+      void
+      member (const std::string& name, const T& value, bool check = true);
+
+      // Serialize an object member name.
+      //
+      // If check is false, then don't check whether the name is valid UTF-8
+      // and don't escape any characters.
+      //
+      void
+      member_name (const char*, bool check = true);
+
+      void
+      member_name (const std::string&, bool check = true);
+
+      // Begin/end an array.
+      //
+      void
+      begin_array ();
+
+      void
+      end_array ();
+
+      // Serialize a string.
+      //
+      // If check is false, then don't check whether the value is valid UTF-8
+      // and don't escape any characters.
+      //
+      // Note that a NULL C-string pointer is serialized as a null value.
+      //
+      void
+      value (const char*, bool check = true);
+
+      void
+      value (const std::string&, bool check = true);
+
+      // Serialize a number.
+      //
+      template <typename T>
+      typename std::enable_if<std::is_integral<T>::value ||
+                              std::is_floating_point<T>::value>::type
+      value (T);
+
+      // Serialize a boolean value.
+      //
+      void
+      value (bool);
+
+      // Serialize a null value.
+      //
+      void
+      value (std::nullptr_t);
+
+      // Serialize next JSON event.
+      //
+      // If check is false, then don't check whether the value is valid UTF-8
+      // and don't escape any characters.
+      //
+      // Return true if more events are required to complete the (top-level)
+      // value (that is, it is currently incomplete) and false otherwise.
+      // Throw invalid_json_output exception in case of an invalid event or
+      // value.
+      //
+      // At the end of the value an optional absent (nullopt) event can be
+      // serialized to verify the value is complete. If it is incomplete an
+      // invalid_json_output exception is thrown. An optional followup absent
+      // event can be serialized to indicate the completion of a multi-value
+      // sequence (one and only absent event indicates a zero value sequence).
+      // If anything is serialized to a complete value sequence an
+      // invalid_json_output exception is thrown.
+      //
+      // Note that this function was designed to be easily invoked with the
+      // output from parser::next() and parser::data(). For example, for a
+      // single-value mode:
+      //
+      //   optional<event> e;
+      //   do
+      //   {
+      //     e = p.next ();
+      //     s.next (e, p.data ());
+      //   }
+      //   while (e);
+      //
+      // For a multi-value mode:
+      //
+      //   while (p.peek ())
+      //   {
+      //     optional<event> e;
+      //     do
+      //     {
+      //       e = p.next ();
+      //       s.next (e, p.data ());
+      //     }
+      //     while (e);
+      //   }
+      //   s.next (nullopt); // End of value sequence.
+      //
+      bool
+      next (optional<event> event,
+            std::pair<const char*, std::size_t> value = {},
+            bool check = true);
+
+    private:
+      void
+      write (event,
+             std::pair<const char*, std::size_t> sep,
+             std::pair<const char*, std::size_t> val,
+             bool check, char quote = '\0');
+
+      // Forward a value(v, check) call to value(v) ignoring the check
+      // argument. Used in the member() implementation.
+      //
+      template <typename T>
+      void
+      value (const T& v, bool /*check*/)
+      {
+        value (v);
+      }
+
+      // Convert numbers to string.
+      //
+      static std::size_t to_chars (char*, std::size_t, int);
+      static std::size_t to_chars (char*, std::size_t, long);
+      static std::size_t to_chars (char*, std::size_t, long long);
+      static std::size_t to_chars (char*, std::size_t, unsigned int);
+      static std::size_t to_chars (char*, std::size_t, unsigned long);
+      static std::size_t to_chars (char*, std::size_t, unsigned long long);
+      static std::size_t to_chars (char*, std::size_t, double);
+      static std::size_t to_chars (char*, std::size_t, long double);
+
+      static std::size_t to_chars_impl (char*, std::size_t, const char* fmt, ...);
+
+      buffer buf_;       // Note: buf_.size is a reference (may bind to size_).
+      std::size_t size_; // Output size if the caller did not supply one.
+      overflow_function* overflow_;
+      flush_function* flush_;
+      void* data_;
+
+      // State of a "structured type" (array or object; as per the RFC
+      // terminology).
+      //
+      struct state
+      {
+        const event type;  // Type kind (begin_array or begin_object).
+        std::size_t count; // Number of events serialized inside this type.
+      };
+
+      // Stack of nested structured type states.
+      //
+      // @@ TODO: would have been nice to use small_vector.
+      //
+      std::vector<state> state_;
+
+      // The number of consecutive absent events (nullopt) serialized thus
+      // far.
+      //
+      // Note: initialized to 1 to naturally handle a single absent event
+      // (declares an empty value sequence complete).
+      //
+      std::size_t absent_ = 1;
+
+      // The number of spaces with which to indent (once for each level of
+      // nesting). If zero, pretty-printing is disabled.
+      //
+      std::size_t indent_;
+
+      // Separator and indentation before/after value inside an object or
+      // array (see pretty-printing implementation for details).
+      //
+      std::string sep_;
+
+      // The number of complete top-level values serialized thus far.
+      //
+      std::size_t values_ = 0;
+    };
+
+    class LIBBUTL_SYMEXPORT stream_serializer: public buffer_serializer
+    {
+    public:
+      // Serialize to std::ostream.
+      //
+      // If stream exceptions are enabled then the std::ios_base::failure
+      // exception is used to report input/output errors (badbit and failbit).
+      // Otherwise, those are reported as the invalid_json_output exception.
+      //
+      explicit
+      stream_serializer (std::ostream&, std::size_t indentation = 2);
+
+    protected:
+      char tmp_[4096]; // Buffer that output is staged in before being written.
+    };
+  }
+}
+
+#include <libbutl/json/serializer.ixx>
diff --git a/libbutl/json/serializer.ixx b/libbutl/json/serializer.ixx
new file mode 100644
index 0000000..5b2c173
--- /dev/null
+++ b/libbutl/json/serializer.ixx
@@ -0,0 +1,202 @@
+namespace butl
+{
+  namespace json
+  {
+    // Construct from a C-string description.
+    //
+    // The description becomes the std::invalid_argument message while the
+    // event, error code, and offset are stored in the corresponding members.
+    //
+    inline invalid_json_output::
+    invalid_json_output (optional<event_type> e,
+                         error_code c,
+                         const char* d,
+                         std::size_t o)
+        : std::invalid_argument (d),
+          event (e),
+          code (c),
+          offset (o)
+    {
+    }
+
+    // Same as the C-string version but with the description specified as
+    // std::string (delegates, passing the string's C-string representation).
+    //
+    inline invalid_json_output::
+    invalid_json_output (optional<event_type> e, error_code c,
+                         const std::string& d, std::size_t o)
+        : invalid_json_output (e, c, d.c_str (), o)
+    {
+    }
+
+    // Constructor to which the other buffer_serializer constructors in this
+    // file delegate.
+    //
+    // The b, s, and c arguments initialize the output buffer state (buf_),
+    // o and f are the overflow and flush callbacks, d is stored as data_
+    // (presumably passed to the callbacks), and i is the indentation.
+    //
+    inline buffer_serializer::
+    buffer_serializer (void* b, std::size_t& s, std::size_t c,
+                       overflow_function* o, flush_function* f, void* d,
+                       std::size_t i)
+        : buf_ {b, s, c},
+          overflow_ (o),
+          flush_ (f),
+          data_ (d),
+          indent_ (i),
+          sep_ (i == 0 ? "" : ",\n") // Non-empty only if pretty-printing.
+    {
+    }
+
+    // Serialize into a fixed-size std::array using its full size (N) as the
+    // buffer capacity and without any overflow/flush callbacks.
+    //
+    template <std::size_t N>
+    inline buffer_serializer::
+    buffer_serializer (std::array<char, N>& a, std::size_t& s, std::size_t i)
+        : buffer_serializer (a.data (), s, N, nullptr, nullptr, nullptr, i)
+    {
+    }
+
+    // Serialize into a caller-provided buffer of capacity c, with the size
+    // tracked in s, and without any overflow/flush callbacks.
+    //
+    inline buffer_serializer::
+    buffer_serializer (void* b, std::size_t& s, std::size_t c, std::size_t i)
+        : buffer_serializer (b, s, c,
+                             nullptr /* overflow */,
+                             nullptr /* flush */,
+                             nullptr /* data */,
+                             i)
+    {
+    }
+
+    // Same as the constructor that takes the size by reference except that
+    // the size is tracked in the size_ member rather than in a caller-provided
+    // variable.
+    //
+    inline buffer_serializer::
+    buffer_serializer (void* b, std::size_t c,
+                       overflow_function* o,
+                       flush_function* f,
+                       void* d,
+                       std::size_t i)
+        : buffer_serializer (b, size_, c, o, f, d, i)
+    {
+      // A delegating constructor cannot have other member initializers so
+      // zero the size here.
+      //
+      size_ = 0;
+    }
+
+    // Serialize the beginning of an object (the begin_object event).
+    //
+    inline void buffer_serializer::
+    begin_object ()
+    {
+      next (event::begin_object);
+    }
+
+    // Serialize the end of an object (the end_object event).
+    //
+    inline void buffer_serializer::
+    end_object ()
+    {
+      next (event::end_object);
+    }
+
+    // Serialize an object member name specified as a C-string (the name
+    // event). A NULL name is passed on as is with zero length.
+    //
+    inline void buffer_serializer::
+    member_name (const char* n, bool c)
+    {
+      const std::size_t l (n == nullptr ? 0 : strlen (n));
+      next (event::name, {n, l}, c);
+    }
+
+    // Serialize an object member name specified as std::string (the name
+    // event), passing on the string's data and size.
+    //
+    inline void buffer_serializer::
+    member_name (const std::string& n, bool c)
+    {
+      const char* s (n.c_str ());
+      next (event::name, {s, n.size ()}, c);
+    }
+
+    // Serialize an object member with a C-string name: the name followed by
+    // the value. The check flag is forwarded to both calls.
+    //
+    template <typename T>
+    inline void buffer_serializer::
+    member (const char* n, const T& v, bool c)
+    {
+      member_name (n, c); // Name first.
+      value (v, c);       // Then its value.
+    }
+
+    // Serialize an object member with a std::string name: the name followed
+    // by the value. The check flag is forwarded to both calls.
+    //
+    template <typename T>
+    inline void buffer_serializer::
+    member (const std::string& n, const T& v, bool c)
+    {
+      member_name (n, c); // Name first.
+      value (v, c);       // Then its value.
+    }
+
+    // Serialize the beginning of an array (the begin_array event).
+    //
+    inline void buffer_serializer::
+    begin_array ()
+    {
+      next (event::begin_array);
+    }
+
+    // Serialize the end of an array (the end_array event).
+    //
+    inline void buffer_serializer::
+    end_array ()
+    {
+      next (event::end_array);
+    }
+
+    // Serialize a C-string as a string value. A NULL pointer is serialized
+    // as the null event instead (in which case the check flag is not used).
+    //
+    inline void buffer_serializer::
+    value (const char* v, bool c)
+    {
+      if (v == nullptr)
+      {
+        next (event::null);
+        return;
+      }
+
+      next (event::string, {v, strlen (v)}, c);
+    }
+
+    // Serialize std::string as a string value (the string event), passing on
+    // the string's data and size.
+    //
+    inline void buffer_serializer::
+    value (const std::string& v, bool c)
+    {
+      const char* s (v.c_str ());
+      next (event::string, {s, v.size ()}, c);
+    }
+
+    // Serialize an integral or floating point value as a number (the number
+    // event) using the to_chars() overload that matches the argument.
+    //
+    template <typename T>
+    typename std::enable_if<std::is_integral<T>::value ||
+                            std::is_floating_point<T>::value>::type
+    buffer_serializer::
+    value (T v)
+    {
+      // 40 characters is sufficient for the 39 digits of the largest 128-bit
+      // integer while long floating point numbers are output in scientific
+      // notation and thus also fit.
+      //
+      char buf[40];
+      const std::size_t len (to_chars (buf, sizeof (buf), v));
+      next (event::number, {buf, len});
+    }
+
+    // Serialize a boolean value (the boolean event) as the literal true or
+    // false, passed on together with its length.
+    //
+    inline void buffer_serializer::
+    value (bool b)
+    {
+      if (b)
+        next (event::boolean, std::make_pair ("true", 4));
+      else
+        next (event::boolean, std::make_pair ("false", 5));
+    }
+
+    // Serialize a null value (the null event).
+    //
+    inline void buffer_serializer::
+    value (std::nullptr_t)
+    {
+      next (event::null);
+    }
+
+    // Number to text conversion overloads.
+    //
+    // Each overload passes the buffer b of size s, the printf-style format
+    // that matches the argument type, and the value v to to_chars_impl() and
+    // returns its result (used by value() as the length of the text).
+    //
+    // Note that size_t is spelled as std::size_t, as in the rest of this
+    // file, since only the std:: name is guaranteed to be declared by
+    // <cstddef>.
+    //
+    inline std::size_t buffer_serializer::
+    to_chars (char* b, std::size_t s, int v)
+    {
+      return to_chars_impl (b, s, "%d", v);
+    }
+
+    inline std::size_t buffer_serializer::
+    to_chars (char* b, std::size_t s, long v)
+    {
+      return to_chars_impl (b, s, "%ld", v);
+    }
+
+    inline std::size_t buffer_serializer::
+    to_chars (char* b, std::size_t s, long long v)
+    {
+      return to_chars_impl (b, s, "%lld", v);
+    }
+
+    inline std::size_t buffer_serializer::
+    to_chars (char* b, std::size_t s, unsigned v)
+    {
+      return to_chars_impl (b, s, "%u", v);
+    }
+
+    inline std::size_t buffer_serializer::
+    to_chars (char* b, std::size_t s, unsigned long v)
+    {
+      return to_chars_impl (b, s, "%lu", v);
+    }
+
+    inline std::size_t buffer_serializer::
+    to_chars (char* b, std::size_t s, unsigned long long v)
+    {
+      return to_chars_impl (b, s, "%llu", v);
+    }
+
+    // The floating point formats limit the output to 10 significant digits.
+    //
+    // NOTE(review): this is fewer than the 17 digits needed to round-trip an
+    // IEEE 754 double -- confirm the precision loss is intended.
+    //
+    inline std::size_t buffer_serializer::
+    to_chars (char* b, std::size_t s, double v)
+    {
+      return to_chars_impl (b, s, "%.10g", v);
+    }
+
+    inline std::size_t buffer_serializer::
+    to_chars (char* b, std::size_t s, long double v)
+    {
+      return to_chars_impl (b, s, "%.10Lg", v);
+    }
+  }
+}
author	Boris Kolpackov <boris@codesynthesis.com>	2022-02-28 08:57:02 +0200
committer	Boris Kolpackov <boris@codesynthesis.com>	2022-03-02 13:58:55 +0200
commit	152ea943395822f55591eadaf8e0f5aac263db5e (patch)
tree	268a9dae37704b2398876dad44a5095ba6d6f750
parent	0e5d575feceea4feac4b33e85626719e14f762a1 (diff)