From d816b9db85a66739bdb2c2b1e7d8475ce6379484 Mon Sep 17 00:00:00 2001 From: Boris Kolpackov Date: Mon, 28 Feb 2022 08:57:02 +0200 Subject: Add JSON serializer (copy of libstud-json) --- libbutl/json/serializer.cxx | 667 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 667 insertions(+) create mode 100644 libbutl/json/serializer.cxx (limited to 'libbutl/json/serializer.cxx') diff --git a/libbutl/json/serializer.cxx b/libbutl/json/serializer.cxx new file mode 100644 index 0000000..6f9bc95 --- /dev/null +++ b/libbutl/json/serializer.cxx @@ -0,0 +1,667 @@ +#include // snprintf +#include // va_list +#include // memcpy +#include + +#include + +using namespace std; + +namespace butl +{ + namespace json + { + using buffer = buffer_serializer::buffer; + using error_code = invalid_json_output::error_code; + + template + static void + dynarray_overflow (void* d, event, buffer& b, size_t ex) + { + T& v (*static_cast (d)); + v.resize (b.capacity + ex); + v.resize (v.capacity ()); + // const_cast is required for std::string pre C++17. + // + b.data = const_cast (v.data ()); + b.capacity = v.size (); + } + + template + static void + dynarray_flush (void* d, event, buffer& b) + { + T& v (*static_cast (d)); + v.resize (b.size); + b.data = const_cast (v.data ()); + b.capacity = b.size; + } + + buffer_serializer:: + buffer_serializer (string& s, size_t i) + : buffer_serializer (const_cast (s.data ()), size_, s.size (), + dynarray_overflow, + dynarray_flush, + &s, + i) + { + size_ = s.size (); + } + + buffer_serializer:: + buffer_serializer (vector& v, size_t i) + : buffer_serializer (v.data (), size_, v.size (), + dynarray_overflow>, + dynarray_flush>, + &v, + i) + { + size_ = v.size (); + } + + static void + ostream_overflow (void* d, event e, buffer& b, size_t) + { + ostream& s (*static_cast (d)); + s.write (static_cast (b.data), b.size); + if (s.fail ()) + throw invalid_json_output ( + e, error_code::buffer_overflow, "unable to write JSON output text"); + b.size = 0; + } + + static void + ostream_flush (void* d, event e, buffer& b) + { + ostream_overflow (d, e, b, 0); + + ostream& s (*static_cast (d)); + s.flush (); + if (s.fail ()) + throw invalid_json_output ( + e, error_code::buffer_overflow, "unable to write JSON output text"); + } + + stream_serializer:: + stream_serializer (ostream& os, size_t i) + : buffer_serializer (tmp_, sizeof (tmp_), + ostream_overflow, + ostream_flush, + &os, + i) + { + } + + bool buffer_serializer:: + next (optional e, pair val, bool check) + { + if (absent_ == 2) + goto fail_complete; + + if (e == nullopt) + { + if (!state_.empty ()) + goto fail_incomplete; + + absent_++; + return false; + } + + absent_ = 0; // Clear inter-value absent event. + + { + state* st (state_.empty () ? nullptr : &state_.back ()); + + auto name_expected = [] (const state& s) + { + return s.type == event::begin_object && s.count % 2 == 0; + }; + + auto make_str = [] (const char* s, size_t n) + { + return make_pair (s, n); + }; + + // When it comes to pretty-printing, the common way to do it is along + // these lines: + // + // { + // "str": "value", + // "obj": { + // "arr": [ + // 1, + // 2, + // 3 + // ] + // }, + // "num": 123 + // } + // + // Empty objects and arrays are printed without a newline: + // + // { + // "obj": {}, + // "arr": [] + // } + // + // There are two types of separators: between name and value, which is + // always ": ", and before/after value inside an object or array which + // is either newline followed by indentation, or comma followed by + // newline followed by indentation (we also have separation between + // top-level values but that's orthogonal to pretty-printing). + // + // Based on this observation, we are going to handle the latter case by + // starting with the ",\n" string (in this->sep_) and pushing/popping + // indentation spaces as we enter/leave objects and arrays. We handle + // the cases where we don't need the comma by simply skipping it in the + // C-string pointer. + // + bool pp (indent_ != 0); + + pair sep; + if (st != nullptr) + { + // The name-value separator. + // + if (st->type == event::begin_object && st->count % 2 == 1) + { + sep = !pp ? make_str (":", 1) : make_str (": ", 2); + } + // We don't need the comma if we are closing the object or array. + // + else if (e == event::end_array || e == event::end_object) + { + // But in this case we need to unindent one level prior to writing + // the brace. Also handle the empty object/array as a special case. + // + sep = !pp || st->count == 0 + ? make_str (nullptr, 0) + : make_str (sep_.c_str () + 1, sep_.size () - 1 - indent_); + } + // Or if this is the first value (note: must come after end_*). + // + else if (st->count == 0) + { + sep = !pp + ? make_str (nullptr, 0) + : make_str (sep_.c_str () + 1, sep_.size () - 1); + } + else + { + sep = !pp + ? make_str (",", 1) + : make_str (sep_.c_str (), sep_.size ()); + } + } + else if (values_ != 0) // Subsequent top-level value. + { + // Top-level value separation. For now we always separate them with + // newlines, which is the most common/sensible way. + // + sep = make_str ("\n", 1); + } + + switch (*e) + { + case event::begin_array: + case event::begin_object: + { + if (st != nullptr && name_expected (*st)) + goto fail_unexpected_event; + + write (*e, + sep, + make_str (e == event::begin_array ? "[" : "{", 1), + false); + + if (st != nullptr) + st->count++; + + if (pp) + sep_.append (indent_, ' '); + + state_.push_back (state {*e, 0}); + break; + } + case event::end_array: + case event::end_object: + { + if (st == nullptr || (e == event::end_array + ? st->type != event::begin_array + : !name_expected (*st))) + goto fail_unexpected_event; + + write (*e, + sep, + make_str (e == event::end_array ? "]" : "}", 1), + false); + + if (pp) + sep_.erase (sep_.size () - indent_); + + state_.pop_back (); + break; + } + case event::name: + case event::string: + { + if (e == event::name + ? (st == nullptr || !name_expected (*st)) + : (st != nullptr && name_expected (*st))) + goto fail_unexpected_event; + + write (*e, sep, val, check, '"'); + + if (st != nullptr) + st->count++; + break; + } + case event::null: + case event::boolean: + { + if (e == event::null && val.first == nullptr) + val = {"null", 4}; + else if (check) + { + auto eq = [&val] (const char* v, size_t n) + { + return val.second == n && memcmp (val.first, v, n) == 0; + }; + + if (e == event::null) + { + if (!eq ("null", 4)) + goto fail_null; + } + else + { + if (!eq ("true", 4) && !eq ("false", 5)) + goto fail_bool; + } + } + } + // Fall through. + case event::number: + { + if (st != nullptr && name_expected (*st)) + goto fail_unexpected_event; + + write (*e, sep, val, check); + + if (st != nullptr) + st->count++; + break; + } + } + } + + if (state_.empty ()) + { + values_++; + if (flush_ != nullptr) + flush_ (data_, *e, buf_); + + return false; + } + + return true; + + fail_complete: + throw invalid_json_output ( + e, error_code::invalid_value, "value sequence is complete"); + fail_incomplete: + throw invalid_json_output ( + e, error_code::invalid_value, "value is incomplete"); + fail_null: + throw invalid_json_output ( + e, error_code::invalid_value, "invalid null value"); + fail_bool: + throw invalid_json_output ( + e, error_code::invalid_value, "invalid boolean value"); + fail_unexpected_event: + throw invalid_json_output ( + e, error_code::unexpected_event, "unexpected event"); + } + + // JSON escape sequences for control characters <= 0x1F. + // + static const char* json_escapes[] = + {"\\u0000", "\\u0001", "\\u0002", "\\u0003", "\\u0004", "\\u0005", + "\\u0006", "\\u0007", "\\b", "\\t", "\\n", "\\u000B", + "\\f", "\\r", "\\u000E", "\\u000F", "\\u0010", "\\u0011", + "\\u0012", "\\u0013", "\\u0014", "\\u0015", "\\u0016", "\\u0017", + "\\u0018", "\\u0019", "\\u001A", "\\u001B", "\\u001C", "\\u001D", + "\\u001E", "\\u001F"}; + + void buffer_serializer:: + write (event e, + pair sep, + pair val, + bool check, + char q) + { + // Assumptions: + // + // 1. A call to overflow should be able to provide enough capacity to + // write the entire separator (in other words, we are not going to + // bother with chunking the separator). + // + // 2. Similarly, a call to overflow should be able to provide enough + // capacity to write an entire UTF-8 multi-byte sequence. + // + // 3. Performance-wise, we do not expect very long contiguous sequences + // of character that require escaping. + + // Total number of bytes remaining to be written and the capacity + // currently available. + // + size_t size (sep.second + val.second + (q != '\0' ? 2 : 0)); + size_t cap (buf_.capacity - buf_.size); + + auto grow = [this, e, &size, &cap] (size_t min, size_t extra = 0) + { + if (overflow_ == nullptr) + return false; + + overflow_ (data_, e, buf_, size + extra - cap); + cap = buf_.capacity - buf_.size; + + return cap >= min; + }; + + auto append = [this, &cap, &size] (const char* d, size_t s) + { + memcpy (static_cast (buf_.data) + buf_.size, d, s); + buf_.size += s; + cap -= s; + size -= s; + }; + + // Return the longest chunk of input that fits into the buffer and does + // not end in the middle of a multi-byte UTF-8 sequence. Assume value + // size and capacity are not 0. Return NULL in first if no chunk could + // be found that fits into the remaining space. In this case, second is + // the additional (to size) required space (used to handle escapes in + // the checked version). + // + // The basic idea is to seek in the input buffer to the capacity of the + // output buffer (unless the input is shorter than the output). If we + // ended up in the middle of a multi-byte UTF-8 sequence, then seek back + // until we end up at the UTF-8 sequence boundary. Note that this + // implementation assumes valid UTF-8. + // + auto chunk = [&cap, &val] () -> pair + { + pair r (nullptr, 0); + + if (cap >= val.second) + r = val; + else + { + // Start from the character past capacity and search for a UTF-8 + // sequence boundary. + // + for (const char* p (val.first + cap); p != val.first; --p) + { + const auto u (static_cast (*p)); + if (u < 0x80 || u > 0xBF) // Not a continuation byte + { + r = {val.first, p - val.first}; + break; + } + } + } + + val.first += r.second; + val.second -= r.second; + + return r; + }; + + // Escaping and UTF-8-validating version of chunk(). + // + // There are three classes of mandatory escapes in a JSON string: + // + // - \\ and \" + // + // - \b \f \n \r \t for popular control characters + // + // - \u00NN for other control characters <= 0x1F + // + // If the input begins with a character that must be escaped, return + // only its escape sequence. Otherwise validate and return everything up + // to the end of input or buffer capacity, but cutting it short before + // the next character that must be escaped or the first UTF-8 sequence + // that would not fit. + // + // Return string::npos in second in case of a stray continuation byte or + // any byte in an invalid UTF-8 range (for example, an "overlong" 2-byte + // encoding of a 7-bit/ASCII character or a 4-, 5-, or 6-byte sequence + // that would encode a codepoint beyond the U+10FFFF Unicode limit). + // + auto chunk_checked = [&cap, &size, &val] () -> pair + { + pair r (nullptr, 0); + + // Check whether the first character needs to be escaped. + // + const uint8_t c (val.first[0]); + if (c == '"') + r = {"\\\"", 2}; + else if (c == '\\') + r = {"\\\\", 2}; + else if (c <= 0x1F) + { + auto s (json_escapes[c]); + r = {s, s[1] == 'u' ? 6 : 2}; + } + + if (r.first != nullptr) + { + // Return in second the additional (to size) space required. + // + if (r.second > cap) + return {nullptr, r.second - 1}; + + // If we had to escape the character then adjust size accordingly + // (see append() above). + // + size += r.second - 1; + + val.first += 1; + val.second -= 1; + return r; + } + + // First character doesn't need to be escaped. Return as much of the + // rest of the input as possible. + // + size_t i (0); + for (size_t n (min (cap, val.second)); i != n; i++) + { + const uint8_t c1 (val.first[i]); + + if (c1 == '"' || c1 == '\\' || c1 <= 0x1F) // Needs to be escaped. + break; + else if (c1 >= 0x80) // Not ASCII, so validate as a UTF-8 sequence. + { + size_t i1 (i); // Position of the first byte. + + // The control flow here is to continue if valid and to fall + // through to return on error. + // + if (c1 >= 0xC2 && c1 <= 0xDF) // 2-byte sequence. + { + if (i + 2 <= val.second) // Sequence is complete in JSON value. + { + if (i + 2 > cap) // Sequence won't fit. + break; + + const uint8_t c2 (val.first[++i]); + + if (c2 >= 0x80 && c2 <= 0xBF) + continue; + } + } + else if (c1 >= 0xE0 && c1 <= 0xEF) // 3-byte sequence. + { + if (i + 3 <= val.second) + { + if (i + 3 > cap) + break; + + const uint8_t c2 (val.first[++i]), c3 (val.first[++i]); + + if (c3 >= 0x80 && c3 <= 0xBF) + { + switch (c1) + { + case 0xE0: if (c2 >= 0xA0 && c2 <= 0xBF) continue; break; + case 0xED: if (c2 >= 0x80 && c2 <= 0x9F) continue; break; + default: if (c2 >= 0x80 && c2 <= 0xBF) continue; break; + } + } + } + } + else if (c1 >= 0xF0 && c1 <= 0xF4) // 4-byte sequence. + { + if (i + 4 <= val.second) + { + if (i + 4 > cap) + break; + + const uint8_t c2 (val.first[++i]), + c3 (val.first[++i]), + c4 (val.first[++i]); + + if (c3 >= 0x80 && c3 <= 0xBF && + c4 >= 0x80 && c4 <= 0xBF) + { + switch (c1) + { + case 0xF0: if (c2 >= 0x90 && c2 <= 0xBF) continue; break; + case 0xF4: if (c2 >= 0x80 && c2 <= 0x8F) continue; break; + default: if (c2 >= 0x80 && c2 <= 0xBF) continue; break; + } + } + } + } + + r = {val.first, string::npos}; + + // Update val to point to the beginning of the invalid sequence. + // + val.first += i1; + val.second -= i1; + + return r; + } + } + + if (i != 0) // We have a chunk. + { + r = {val.first, i}; + + val.first += i; + val.second -= i; + } + + return r; + }; + + // Value's original size (used to calculate the offset of the errant + // character in case of a validation failure). + // + const size_t vn (val.second); + + // Write the separator, if any. + // + if (sep.second != 0) + { + if (cap < sep.second && !grow (sep.second)) + goto fail_nospace; + + append (sep.first, sep.second); + } + + // Write the value's opening quote, if requested. + // + if (q != '\0') + { + if (cap == 0 && !grow (1)) + goto fail_nospace; + + append ("\"", 1); + } + + // Write the value, unless empty. + // + while (val.second != 0) + { + pair ch (nullptr, 0); + + if (cap != 0) + ch = check ? chunk_checked () : chunk (); + + if (ch.first == nullptr) + { + // The minimum extra bytes we need the overflow function to be able + // to provide is based on these sequences that we do not break: + // + // - 4 bytes for a UTF-8 sequence + // - 6 bytes for an escaped Unicode sequence (\uXXXX). + // + if (!grow (6, ch.second)) + goto fail_nospace; + } + else if (ch.second != string::npos) + append (ch.first, ch.second); + else + goto fail_utf8; + } + + // Write the value's closing quote, if requested. + // + if (q != '\0') + { + if (cap == 0 && !grow (1)) + goto fail_nospace; + + append ("\"", 1); + } + + return; + + // Note: keep descriptions consistent with the parser. + // + fail_utf8: + throw invalid_json_output (e, + e == event::name ? error_code::invalid_name + : error_code::invalid_value, + "invalid UTF-8 text", + vn - val.second); + + fail_nospace: + throw invalid_json_output ( + e, error_code::buffer_overflow, "insufficient space in buffer"); + } + + size_t buffer_serializer:: + to_chars_impl (char* b, size_t n, const char* f, ...) + { + va_list a; + va_start (a, f); + const int r (vsnprintf (b, n, f, a)); + va_end (a); + + if (r < 0 || r >= static_cast (n)) + { + throw invalid_json_output (event::number, + error_code::invalid_value, + "unable to convert number to string"); + } + + return static_cast (r); + } + } +} -- cgit v1.1