diff options
Diffstat (limited to 'libbutl/json')
-rw-r--r-- | libbutl/json/event.hxx | 27 | ||||
-rw-r--r-- | libbutl/json/parser.cxx | 645 | ||||
-rw-r--r-- | libbutl/json/parser.hxx | 705 | ||||
-rw-r--r-- | libbutl/json/parser.ixx | 552 | ||||
-rw-r--r-- | libbutl/json/pdjson.c | 1044 | ||||
-rw-r--r-- | libbutl/json/pdjson.h | 147 | ||||
-rw-r--r-- | libbutl/json/serializer.cxx | 671 | ||||
-rw-r--r-- | libbutl/json/serializer.hxx | 413 | ||||
-rw-r--r-- | libbutl/json/serializer.ixx | 247 |
9 files changed, 4451 insertions, 0 deletions
diff --git a/libbutl/json/event.hxx b/libbutl/json/event.hxx new file mode 100644 index 0000000..77185cc --- /dev/null +++ b/libbutl/json/event.hxx @@ -0,0 +1,27 @@ +#pragma once + +#include <cstddef> +#include <cstdint> + +namespace butl +{ + namespace json + { + // Parsing/serialization event. + // + enum class event: std::uint8_t + { + begin_object = 1, + end_object, + begin_array, + end_array, + name, + string, + number, + boolean, + null + }; + + constexpr std::size_t event_count = 9; + } +} diff --git a/libbutl/json/parser.cxx b/libbutl/json/parser.cxx new file mode 100644 index 0000000..8ef7422 --- /dev/null +++ b/libbutl/json/parser.cxx @@ -0,0 +1,645 @@ +#define PDJSON_SYMEXPORT static // See below. + +#include <libbutl/json/parser.hxx> + +#include <istream> + +// There is an issue (segfault) with using std::current_exception() and +// std::rethrow_exception() with older versions of libc++ on Linux. While the +// exact root cause hasn't been determined, the suspicion is that something +// gets messed up if we "smuggle" std::exception_ptr through extern "C" call +// frames (we cannot even destroy such an exception without a segfault). We +// also could not determine in which version exactly this has been fixed but +// we know that libc++ 6.0.0 doesn't appear to have this issue (though we are +// not entirely sure the issue is (only) in libc++; libgcc_s could also be +// involved). +// +// The workaround is to just catch (and note) the exception and then throw a +// new instance of generic std::istream::failure. In order not to drag the +// below test into the header, we wrap exception_ptr with optional<> and use +// NULL to indicate the presence of the exception when the workaround is +// required. +// +// Note that if/when we drop this workaround, we should also get rid of +// optional<> in stream::exception member. 
+// +#undef LIBBUTL_JSON_NO_EXCEPTION_PTR + +#if defined (__linux__) && defined(__clang__) +# if __has_include(<__config>) +# include <__config> // _LIBCPP_VERSION +# if _LIBCPP_VERSION < 6000 +# define LIBBUTL_JSON_NO_EXCEPTION_PTR 1 +# endif +# endif +#endif + +namespace butl +{ + namespace json + { + using namespace std; + + parser:: + ~parser () + { + json_close (impl_); + } + + static int + stream_get (void* x) + { + auto& s (*static_cast<parser::stream*> (x)); + + // In the multi-value mode reading of whitespaces/separators is split + // between our code and pdjson's. As a result, these functions may end + // up being called more than once after EOF is reached. Which is + // something iostream does not handle gracefully. + // + if (!s.is->eof ()) + { + try + { + // We first peek not to trip failbit on EOF. + // + if (s.is->peek () != istream::traits_type::eof ()) + return static_cast<char> (s.is->get ()); + } + catch (...) + { +#ifndef LIBBUTL_JSON_NO_EXCEPTION_PTR + s.exception = current_exception (); +#else + s.exception = nullptr; +#endif + } + } + + return EOF; + } + + static int + stream_peek (void* x) + { + auto& s (*static_cast<parser::stream*> (x)); + + if (!s.is->eof ()) + { + try + { + auto c (s.is->peek ()); + if (c != istream::traits_type::eof ()) + return static_cast<char> (c); + } + catch (...) + { +#ifndef LIBBUTL_JSON_NO_EXCEPTION_PTR + s.exception = current_exception (); +#else + s.exception = nullptr; +#endif + } + } + + return EOF; + } + + // NOTE: watch out for exception safety (specifically, doing anything that + // might throw after opening the stream). 
+ // + parser:: + parser (istream& is, const char* n, bool mv, const char* sep) noexcept + : input_name (n), + stream_ {&is, nullopt}, + multi_value_ (mv), + separators_ (sep), + raw_s_ (nullptr), + raw_n_ (0) + { + json_open_user (impl_, &stream_get, &stream_peek, &stream_); + json_set_streaming (impl_, multi_value_); + } + + parser:: + parser (const void* t, + size_t s, + const char* n, + bool mv, + const char* sep) noexcept + : input_name (n), + stream_ {nullptr, nullopt}, + multi_value_ (mv), + separators_ (sep), + raw_s_ (nullptr), + raw_n_ (0) + { + json_open_buffer (impl_, t, s); + json_set_streaming (impl_, multi_value_); + } + + optional<event> parser:: + next () + { + name_p_ = value_p_ = location_p_ = false; + + // Note that for now we don't worry about the state of the parser if + // next_impl() throws assuming it is not going to be reused. + // + if (peeked_) + { + parsed_ = peeked_; + peeked_ = nullopt; + } + else + parsed_ = next_impl (); + + return translate (*parsed_); + } + + optional<event> parser:: + peek () + { + if (!peeked_) + { + if (parsed_) + { + cache_parsed_data (); + cache_parsed_location (); + } + peeked_ = next_impl (); + } + return translate (*peeked_); + } + + static inline const char* + event_name (event e) + { + switch (e) + { + case event::begin_object: return "beginning of object"; + case event::end_object: return "end of object"; + case event::begin_array: return "beginning of array"; + case event::end_array: return "end of array"; + case event::name: return "member name"; + case event::string: return "string value"; + case event::number: return "numeric value"; + case event::boolean: return "boolean value"; + case event::null: return "null value"; + } + + return ""; + } + + bool parser:: + next_expect (event p, optional<event> s) + { + optional<event> e (next ()); + bool r; + if (e && ((r = *e == p) || (s && *e == *s))) + return r; + + string d ("expected "); + d += event_name (p); + + if (s) + { + d += " or "; + d += 
event_name (*s); + } + + if (e) + { + d += " instead of "; + d += event_name (*e); + } + + throw invalid_json_input (input_name != nullptr ? input_name : "", + line (), + column (), + position (), + move (d)); + } + + void parser:: + next_expect_name (const char* n, bool su) + { + for (;;) + { + next_expect (event::name); + + if (name () == n) + return; + + if (!su) + break; + + next_expect_value_skip (); + } + + string d ("expected object member name '"); + d += n; + d += "' instead of '"; + d += name (); + d += '\''; + + throw invalid_json_input (input_name != nullptr ? input_name : "", + line (), + column (), + position (), + move (d)); + } + + void parser:: + next_expect_value_skip () + { + optional<event> e (next ()); + + if (e) + { + switch (*e) + { + case event::begin_object: + case event::begin_array: + { + // Skip until matching end_object/array keeping track of nesting. + // We are going to rely on the fact that we should either get such + // an event or next() should throw. + // + event be (*e); + event ee (be == event::begin_object + ? event::end_object + : event::end_array); + + for (size_t n (0);; ) + { + event e (*next ()); + + if (e == ee) + { + if (n == 0) + break; + + --n; + } + else if (e == be) + ++n; + } + + return; + } + case event::string: + case event::number: + case event::boolean: + case event::null: + return; + case event::name: + case event::end_object: + case event::end_array: + break; + } + } + + string d ("expected value"); + + if (e) + { + d += " instead of "; + d += event_name (*e); + } + + throw invalid_json_input (input_name != nullptr ? 
input_name : "", + line (), + column (), + position (), + move (d)); + } + + std::uint64_t parser:: + line () const noexcept + { + if (!location_p_) + { + if (!parsed_) + return 0; + + assert (!peeked_); + + return static_cast<uint64_t> ( + json_get_lineno (const_cast<json_stream*> (impl_))); + } + + return line_; + } + + std::uint64_t parser:: + column () const noexcept + { + if (!location_p_) + { + if (!parsed_) + return 0; + + assert (!peeked_); + + return static_cast<uint64_t> ( + json_get_column (const_cast<json_stream*> (impl_))); + } + + return column_; + } + + std::uint64_t parser:: + position () const noexcept + { + if (!location_p_) + { + if (!parsed_) + return 0; + + assert (!peeked_); + + return static_cast<uint64_t> ( + json_get_position (const_cast<json_stream*> (impl_))); + } + + return position_; + } + + json_type parser:: + next_impl () + { + raw_s_ = nullptr; + raw_n_ = 0; + json_type e; + + // Read characters between values skipping required separators and JSON + // whitespaces. Return whether a required separator was encountered as + // well as the first non-separator/whitespace character (which, if EOF, + // should trigger a check for input/output errors). + // + // Note that the returned non-separator will not have been extracted + // from the input (so position, column, etc. will still refer to its + // predecessor). + // + auto skip_separators = [this] () -> pair<bool, int> + { + bool r (separators_ == nullptr); + + int c; + for (; (c = json_source_peek (impl_)) != EOF; json_source_get (impl_)) + { + // User separator. + // + if (separators_ != nullptr && *separators_ != '\0') + { + if (strchr (separators_, c) != nullptr) + { + r = true; + continue; + } + } + + // JSON separator. 
+ // + if (json_isspace (c)) + { + if (separators_ != nullptr && *separators_ == '\0') + r = true; + + continue; + } + + break; + } + + return make_pair (r, c); + }; + + // In the multi-value mode skip any instances of required separators + // (and any other JSON whitespace) preceding the first JSON value. + // + if (multi_value_ && !parsed_ && !peeked_) + { + if (skip_separators ().second == EOF && stream_.is != nullptr) + { + if (stream_.exception) goto fail_rethrow; + if (stream_.is->fail ()) goto fail_stream; + } + } + + e = json_next (impl_); + + // First check for a pending input/output error. + // + if (stream_.is != nullptr) + { + if (stream_.exception) goto fail_rethrow; + if (stream_.is->fail ()) goto fail_stream; + } + + // There are two ways to view separation between two values: as following + // the first value or as preceding the second value. And one aspect that + // is determined by this is whether a separation violation is a problem + // with the first value or with the second, which becomes important if + // the user bails out before parsing the second value. + // + // Consider these two unseparated values (yes, in JSON they are two + // values, leading zeros are not allowed in JSON numbers): + // + // 01 + // + // If the user bails out after parsing 0 in a stream that should have + // been newline-delimited, they most likely would want to get an error + // since this is most definitely an invalid value rather than two + // values that are not properly separated. So in this light we handle + // separators at the end of the first value. + // + switch (e) + { + case JSON_DONE: + { + // Deal with the following value separators. + // + // Note that we must not do this for the second JSON_DONE (or the + // first one in case there are no values) that signals the end of + // input. + // + if (multi_value_ && + (parsed_ || peeked_) && + (peeked_ ?
*peeked_ : *parsed_) != JSON_DONE) + { + auto p (skip_separators ()); + + if (p.second == EOF && stream_.is != nullptr) + { + if (stream_.exception) goto fail_rethrow; + if (stream_.is->fail ()) goto fail_stream; + } + + // Note that we don't require separators after the last value. + // + if (!p.first && p.second != EOF) + { + json_source_get (impl_); // Consume to update column number. + goto fail_separation; + } + + json_reset (impl_); + } + break; + } + case JSON_ERROR: goto fail_json; + case JSON_STRING: + case JSON_NUMBER: + raw_s_ = json_get_string (impl_, &raw_n_); + raw_n_--; // Includes terminating `\0`. + break; + case JSON_TRUE: raw_s_ = "true"; raw_n_ = 4; break; + case JSON_FALSE: raw_s_ = "false"; raw_n_ = 5; break; + case JSON_NULL: raw_s_ = "null"; raw_n_ = 4; break; + default: break; + } + + return e; + + fail_json: + throw invalid_json_input ( + input_name != nullptr ? input_name : "", + static_cast<uint64_t> (json_get_lineno (impl_)), + static_cast<uint64_t> (json_get_column (impl_)), + static_cast<uint64_t> (json_get_position (impl_)), + json_get_error (impl_)); + + fail_separation: + throw invalid_json_input ( + input_name != nullptr ? input_name : "", + static_cast<uint64_t> (json_get_lineno (impl_)), + static_cast<uint64_t> (json_get_column (impl_)), + static_cast<uint64_t> (json_get_position (impl_)), + "missing separator between JSON values"); + + fail_stream: + throw invalid_json_input ( + input_name != nullptr ? 
input_name : "", + static_cast<uint64_t> (json_get_lineno (impl_)), + static_cast<uint64_t> (json_get_column (impl_)), + static_cast<uint64_t> (json_get_position (impl_)), + "unable to read JSON input text"); + + fail_rethrow: +#ifndef LIBBUTL_JSON_NO_EXCEPTION_PTR + rethrow_exception (move (*stream_.exception)); +#else + throw istream::failure ("unable to read"); +#endif + } + + optional<event> parser:: + translate (json_type e) const noexcept + { + switch (e) + { + case JSON_DONE: return nullopt; + case JSON_OBJECT: return event::begin_object; + case JSON_OBJECT_END: return event::end_object; + case JSON_ARRAY: return event::begin_array; + case JSON_ARRAY_END: return event::end_array; + case JSON_STRING: + { + // This can be a value or, inside an object, a name from the + // name/value pair. + // + size_t n; + return json_get_context (const_cast<json_stream*> (impl_), &n) == + JSON_OBJECT && + n % 2 == 1 + ? event::name + : event::string; + } + case JSON_NUMBER: return event::number; + case JSON_TRUE: return event::boolean; + case JSON_FALSE: return event::boolean; + case JSON_NULL: return event::null; + case JSON_ERROR: assert (false); // Should've been handled by caller. + } + + return nullopt; // Should never reach. 
+ } + + void parser:: + cache_parsed_data () + { + name_p_ = value_p_ = false; + if (const optional<event> e = translate (*parsed_)) + { + if (e == event::name) + { + name_.assign (raw_s_, raw_n_); + name_p_ = true; + } + else if (value_event (e)) + { + value_.assign (raw_s_, raw_n_); + value_p_ = true; + } + } + } + + void parser:: + cache_parsed_location () noexcept + { + line_ = static_cast<uint64_t> (json_get_lineno (impl_)); + column_ = static_cast<uint64_t> (json_get_column (impl_)); + position_ = static_cast<uint64_t> (json_get_position (impl_)); + location_p_ = true; + } + + bool parser:: + value_event (optional<event> e) noexcept + { + if (!e) + return false; + + switch (*e) + { + case event::string: + case event::number: + case event::boolean: + case event::null: + return true; + default: + return false; + } + } + + [[noreturn]] void parser:: + throw_invalid_value (const char* type, const char* v, size_t n) const + { + string d (string ("invalid ") + type + " value: '"); + d.append (v, n); + d += '\''; + + throw invalid_json_input (input_name != nullptr ? input_name : "", + line (), + column (), + position (), + move (d)); + } + } // namespace json +} // namespace butl + +// Include the implementation into our translation unit (instead of compiling +// it separately) to (hopefully) get function inlining without LTO. +// +// Let's keep it last since the implementation defines a couple of macros. 
+// +#if defined(__clang__) || defined(__GNUC__) +# pragma GCC diagnostic ignored "-Wunused-function" +#endif + +extern "C" +{ +#define PDJSON_STACK_INC 16 +#define PDJSON_STACK_MAX 2048 +#include "pdjson.c" +} diff --git a/libbutl/json/parser.hxx b/libbutl/json/parser.hxx new file mode 100644 index 0000000..95d9c4e --- /dev/null +++ b/libbutl/json/parser.hxx @@ -0,0 +1,705 @@ +#pragma once + +#ifdef BUILD2_BOOTSTRAP +# error JSON parser not available during bootstrap +#endif + +#include <iosfwd> +#include <string> +#include <cstddef> // size_t +#include <cstdint> // uint64_t +#include <utility> // pair +#include <exception> // exception_ptr +#include <stdexcept> // invalid_argument + +#include <libbutl/optional.hxx> // butl::optional is std::optional or similar. + +#include <libbutl/json/event.hxx> + +#include <libbutl/json/pdjson.h> // Implementation details. + +#include <libbutl/export.hxx> + +namespace butl +{ + // Using the RFC8259 terminology: JSON (input) text, JSON value, object + // member. + // + namespace json + { + class invalid_json_input: public std::invalid_argument + { + public: + std::string name; + std::uint64_t line; + std::uint64_t column; + std::uint64_t position; + + invalid_json_input (std::string name, + std::uint64_t line, + std::uint64_t column, + std::uint64_t position, + const std::string& description); + + invalid_json_input (std::string name, + std::uint64_t line, + std::uint64_t column, + std::uint64_t position, + const char* description); + }; + + class LIBBUTL_SYMEXPORT parser + { + public: + const char* input_name; + + // Construction. + // + + // Parse JSON input text from std::istream. + // + // The name argument is used to identify the input being parsed. Note + // that the stream, name, and separators are kept as references so they + // must outlive the parser instance. + // + // If stream exceptions are enabled then the std::ios_base::failure + // exception is used to report input/output errors (badbit and failbit). 
+ // Otherwise, those are reported as the invalid_json_input exception. + // + // If multi_value is true, enable the multi-value mode in which case the + // input stream may contain multiple JSON values (more precisely, zero + // or more). If false (the default), parsing will fail unless there is + // exactly one JSON value in the input stream. + // + // If multi_value is true, the separators argument specifies the + // required separator characters between JSON values. At least one of + // them must be present between every pair of JSON values (in addition + // to any number of JSON whitespaces). No separators are required after + // the last JSON value (but any found will be skipped). + // + // Specifically, if it is NULL, then no separation is required (that is, + // both `{...}{...}` and `{...} {...}` would be valid). If it is empty, + // then at least one JSON whitespace is required. And if it is non- + // empty, then at least one of its characters must be present (for + // example, "\n\t" would require at least one newline or TAB character + // between JSON values). + // + // Note that a separator need not be valid JSON whitespace: any + // character is acceptable (though it probably shouldn't be an object, + // array, or string delimiter and should not occur within a non-self- + // delimited top-level value, such as `true`, `false`, `null`, or a + // number). All instances of required separators before and after a + // value are skipped. Therefore JSON Text Sequences (RFC 7464; AKA + // Record Separator-delimited JSON), which requires the RS (0x1E) + // character before each value, can be handled as well. 
+ // + parser (std::istream&, + const std::string& name, + bool multi_value = false, + const char* separators = nullptr) noexcept; + + parser (std::istream&, + const char* name, + bool multi_value = false, + const char* separators = nullptr) noexcept; + + parser (std::istream&, + std::string&&, + bool = false, + const char* = nullptr) = delete; + + // Parse a memory buffer that contains the entire JSON input text. + // + // The name argument is used to identify the input being parsed. Note + // that the buffer, name, and separators are kept as references so they + // must outlive the parser instance. + // + parser (const void* text, + std::size_t size, + const std::string& name, + bool multi_value = false, + const char* separators = nullptr) noexcept; + + parser (const void* text, + std::size_t size, + const char* name, + bool multi_value = false, + const char* separators = nullptr) noexcept; + + parser (const void*, + std::size_t, + std::string&&, + bool = false, + const char* = nullptr) = delete; + + // Similar to the above but parse a string. + // + parser (const std::string& text, + const std::string& name, + bool multi_value = false, + const char* separators = nullptr) noexcept; + + parser (const std::string& text, + const char* name, + bool multi_value = false, + const char* separators = nullptr) noexcept; + + parser (const std::string&, + std::string&&, + bool = false, + const char* = nullptr) = delete; + + // Similar to the above but parse a C-string. 
+ // + parser (const char* text, + const std::string& name, + bool multi_value = false, + const char* separators = nullptr) noexcept; + + parser (const char* text, + const char* name, + bool multi_value = false, + const char* separators = nullptr) noexcept; + + parser (const char*, + std::string&&, + bool = false, + const char* = nullptr) = delete; + + parser (parser&&) = delete; + parser (const parser&) = delete; + + parser& operator= (parser&&) = delete; + parser& operator= (const parser&) = delete; + + // Event iteration. + // + + // Return the next event or nullopt if end of input is reached. + // + // In the single-value parsing mode (default) the parsing code could + // look like this: + // + // while (optional<event> e = p.next ()) + // { + // switch (*e) + // { + // // ... + // } + // } + // + // In the multi-value mode the parser additionally returns nullopt after + // every JSON value parsed (so there will be two nullopt's after the + // last JSON value, the second indicating the end of input). + // + // One way to perform multi-value parsing is with the help of the peek() + // function (see below): + // + // while (p.peek ()) + // { + // while (optional<event> e = p.next ()) + // { + // switch (*e) + // { + // //... + // } + // } + // } + // + // Note that while the single-value mode will always parse exactly one + // value, the multi-value mode will accept zero values in which case a + // single nullopt is returned. + // + optional<event> + next (); + + // The range-based for loop support. + // + // In the single-value parsing mode (default) the parsing code could + // look like this: + // + // for (event e: p) + // { + // switch (e) + // { + // //... + // } + // } + // + // And in the multi-value mode (see next() for more information) like + // this: + // + // while (p.peek ()) + // { + // for (event e: p) + // { + // switch (e) + // { + // //... 
+ // } + // } + // } + // + // Note that generally, the iterator interface doesn't make much sense + // for the parser so for now we have an implementation that is just + // enough for the range-based for. + // + struct iterator; + + iterator begin () {return iterator (this, next ());} + iterator end () {return iterator (nullptr, nullopt);} + + // Return the next event without considering it parsed. In other words, + // after this call, any subsequent calls to peek() and the next call to + // next() (if any) will all return the same event. + // + // Note that the name, value, and line corresponding to the peeked event + // are not accessible with name(), value() and line(); these functions + // will still return values corresponding to the most recent call to + // next(). The peeked values, however, can be accessed in the raw form + // using data(). + // + optional<event> + peek (); + + + // Event data access. + // + + // Return the object member name. + // + const std::string& + name (); + + // Any value (string, number, boolean, and null) can be retrieved as a + // string. Calling this function after any non-value events is illegal. + // + // Note that the value is returned as a non-const string reference and + // you are allowed to move the value out of it. However, this should not + // be done unnecessarily or in cases where the small string optimization + // is likely since the string's buffer is reused to store subsequent + // values. + // + std::string& + value (); + + // Convert the value to an integer, floating point, or bool. Throw + // invalid_json_input if the conversion is impossible without a loss. + // + template <typename T> + T + value () const; + + // Return the value or object member name in the raw form. + // + // Calling this function on non-value/name events is legal in which case + // NULL is returned. Note also that the returned data corresponds to the + // most recent event, whether peeked or parsed. 
+ // + std::pair<const char*, std::size_t> + data () const {return std::make_pair (raw_s_, raw_n_);} + + + // Higher-level API suitable for parsing specific JSON vocabularies. + // + // The API summary: + // + // void next_expect (event); + // bool next_expect (event primary, event secondary); + // + // void next_expect_name (string name, bool skip_unknown = false); + // + // std::string& next_expect_string (); + // T next_expect_string<T> (); + // std::string& next_expect_number (); + // T next_expect_number<T> (); + // std::string& next_expect_boolean (); + // T next_expect_boolean<T>(); + // + // std::string* next_expect_string_null (); + // optional<T> next_expect_string_null<T> (); + // std::string* next_expect_number_null (); + // optional<T> next_expect_number_null<T> (); + // std::string* next_expect_boolean_null (); + // optional<T> next_expect_boolean_null<T>(); + // + // std::string& next_expect_member_string (string name, bool = false); + // T next_expect_member_string<T> (string name, bool = false); + // std::string& next_expect_member_number (string name, bool = false); + // T next_expect_member_number<T> (string name, bool = false); + // std::string& next_expect_member_boolean (string name, bool = false); + // T next_expect_member_boolean<T>(string name, bool = false); + // + // std::string* next_expect_member_string_null (string, bool = false); + // optional<T> next_expect_member_string_null<T> (string, bool = false); + // std::string* next_expect_member_number_null (string, bool = false); + // optional<T> next_expect_member_number_null<T> (string, bool = false); + // std::string* next_expect_member_boolean_null (string, bool = false); + // optional<T> next_expect_member_boolean_null<T>(string, bool = false); + // + // void next_expect_member_object (string name, bool = false); + // bool next_expect_member_object_null(string name, bool = false); + // + // void next_expect_member_array (string name, bool = false); + // bool 
next_expect_member_array_null(string name, bool = false); + // + // void next_expect_value_skip(); + + // Get the next event and make sure that it's what's expected: primary + // or, if specified, secondary event. If it is not either, then throw + // invalid_json_input with appropriate description. Return true if it is + // primary. + // + // The secondary expected event is primarily useful for handling + // optional members. For example: + // + // while (p.next_expect (event::name, event::end_object)) + // { + // // Handle object member. + // } + // + // Or homogeneous arrays: + // + // while (p.next_expect (event::string, event::end_array)) + // { + // // Handle array element. + // } + // + // Or values that can be null: + // + // if (p.next_expect (event::begin_object, event::null)) + // { + // // Parse object. + // } + // + bool + next_expect (event primary, optional<event> secondary = nullopt); + + // Get the next event and make sure it is event::name and the object + // member matches the specified name. If either is not, then throw + // invalid_json_input with appropriate description. If skip_unknown is + // true, then skip over unknown member names until a match is found. + // + void + next_expect_name (const char* name, bool skip_unknown = false); + + void + next_expect_name (const std::string&, bool = false); + + // Get the next event and make sure it is event::<type> returning its + // value similar to the value() functions. If it is not, then throw + // invalid_json_input with appropriate description. + // + std::string& + next_expect_string (); + + template <typename T> + T + next_expect_string (); + + std::string& + next_expect_number (); + + template <typename T> + T + next_expect_number (); + + std::string& + next_expect_boolean (); + + template <typename T> + T + next_expect_boolean (); + + // Similar to next_expect_<type>() but in addition to event::<type> also + // allow event::null, in which case returning no value. 
+ // + std::string* + next_expect_string_null (); + + template <typename T> + optional<T> + next_expect_string_null (); + + std::string* + next_expect_number_null (); + + template <typename T> + optional<T> + next_expect_number_null (); + + std::string* + next_expect_boolean_null (); + + template <typename T> + optional<T> + next_expect_boolean_null (); + + // Call next_expect_name() followed by next_expect_<type>[_null]() + // returning its result. In other words, parse the entire object member + // with the specified name and of type <type>, returning its value. + + // next_expect_member_string() + // + std::string& + next_expect_member_string (const char* name, bool skip_unknown = false); + + std::string& + next_expect_member_string (const std::string&, bool = false); + + template <typename T> + T + next_expect_member_string (const char*, bool = false); + + template <typename T> + T + next_expect_member_string (const std::string&, bool = false); + + // next_expect_member_number() + // + std::string& + next_expect_member_number (const char* name, bool skip_unknown = false); + + std::string& + next_expect_member_number (const std::string&, bool = false); + + template <typename T> + T + next_expect_member_number (const char*, bool = false); + + template <typename T> + T + next_expect_member_number (const std::string&, bool = false); + + // next_expect_member_boolean() + // + std::string& + next_expect_member_boolean (const char* name, bool skip_unknown = false); + + std::string& + next_expect_member_boolean (const std::string&, bool = false); + + template <typename T> + T + next_expect_member_boolean (const char*, bool = false); + + template <typename T> + T + next_expect_member_boolean (const std::string&, bool = false); + + // next_expect_member_string_null() + // + std::string* + next_expect_member_string_null (const char*, bool = false); + + std::string* + next_expect_member_string_null (const std::string&, bool = false); + + template <typename T> + optional<T>
+ next_expect_member_string_null (const char*, bool = false); + + template <typename T> + optional<T> + next_expect_member_string_null (const std::string&, bool = false); + + // next_expect_member_number_null() + // + std::string* + next_expect_member_number_null (const char*, bool = false); + + std::string* + next_expect_member_number_null (const std::string&, bool = false); + + template <typename T> + optional<T> + next_expect_member_number_null (const char*, bool = false); + + template <typename T> + optional<T> + next_expect_member_number_null (const std::string&, bool = false); + + // next_expect_member_boolean_null() + // + std::string* + next_expect_member_boolean_null (const char*, bool = false); + + std::string* + next_expect_member_boolean_null (const std::string&, bool = false); + + template <typename T> + optional<T> + next_expect_member_boolean_null (const char*, bool = false); + + template <typename T> + optional<T> + next_expect_member_boolean_null (const std::string&, bool = false); + + // Call next_expect_name() followed by next_expect(event::begin_object). + // In the _null version also allow event::null, in which case return + // false. + // + void + next_expect_member_object (const char* name, bool skip_unknown = false); + + void + next_expect_member_object (const std::string&, bool = false); + + bool + next_expect_member_object_null (const char*, bool = false); + + bool + next_expect_member_object_null (const std::string&, bool = false); + + // Call next_expect_name() followed by next_expect(event::begin_array). + // In the _null version also allow event::null, in which case return + // false. 
+ // + void + next_expect_member_array (const char* name, bool skip_unknown = false); + + void + next_expect_member_array (const std::string&, bool = false); + + bool + next_expect_member_array_null (const char*, bool = false); + + bool + next_expect_member_array_null (const std::string&, bool = false); + + // Get the next event and make sure it is the beginning of a value + // (begin_object, begin_array, string, number, boolean, null). If it is + // not, then throw invalid_json_input with appropriate description. + // Otherwise, skip until the end of the value, recursively in case of + // object and array. + // + // This function is primarily useful for skipping unknown object + // members, for example: + // + // while (p.next_expect (event::name, event::end_object)) + // { + // if (p.name () == "known") + // { + // // Handle known member. + // } + // else + // p.next_expect_value_skip (); + // } + // + void + next_expect_value_skip (); + + // Parsing location. + // + + // Return the line number (1-based) corresponding to the most recently + // parsed event or 0 if nothing has been parsed yet. + // + std::uint64_t + line () const noexcept; + + // Return the column number (1-based) corresponding to the beginning of + // the most recently parsed event or 0 if nothing has been parsed yet. + // + std::uint64_t + column () const noexcept; + + // Return the position (byte offset) pointing immediately after the most + // recently parsed event or 0 if nothing has been parsed yet. + // + std::uint64_t + position () const noexcept; + + // Implementation details. + // + public: + struct iterator + { + using value_type = event; + + explicit + iterator (parser* p = nullptr, optional<event> e = nullopt) + : p_ (p), e_ (e) {} + + event operator* () const {return *e_;} + iterator& operator++ () {e_ = p_->next (); return *this;} + + // Comparison only makes sense when comparing to end (eof). 
+ // + bool operator== (iterator y) const {return !e_ && !y.e_;} + bool operator!= (iterator y) const {return !(*this == y);} + + private: + parser* p_; + optional<event> e_; + }; + + struct stream + { + std::istream* is; + optional<std::exception_ptr> exception; + }; + + [[noreturn]] void + throw_invalid_value (const char* type, const char*, std::size_t) const; + + ~parser (); + + private: + // Functionality shared by next() and peek(). + // + json_type + next_impl (); + + // Translate the event produced by the most recent call to next_impl(). + // + // Note that the underlying parser state determines whether name or + // value is returned when translating JSON_STRING. + // + optional<event> + translate (json_type) const noexcept; + + // Cache state (name/value) produced by the most recent call to + // next_impl(). + // + void + cache_parsed_data (); + + // Cache the location numbers as determined by the most recent call to + // next_impl(). + // + void + cache_parsed_location () noexcept; + + // Return true if this is a value event (string, number, boolean, or + // null). + // + static bool + value_event (optional<event>) noexcept; + + stream stream_; + + bool multi_value_; + const char* separators_; + + // The *_p_ members indicate whether the value is present (cached). + // Note: not using optional not to reallocate the string's buffer. + // + std::string name_; bool name_p_ = false; + std::string value_; bool value_p_ = false; + std::uint64_t line_, column_, position_; bool location_p_ = false; + + optional<json_type> parsed_; // Current parsed event if any. + optional<json_type> peeked_; // Current peeked event if any. + + ::json_stream impl_[1]; + + // Cached raw value. 
+ // + const char* raw_s_; + std::size_t raw_n_; + }; + } +} + +#include <libbutl/json/parser.ixx> diff --git a/libbutl/json/parser.ixx b/libbutl/json/parser.ixx new file mode 100644 index 0000000..cf6dca3 --- /dev/null +++ b/libbutl/json/parser.ixx @@ -0,0 +1,552 @@ +#include <cerrno> +#include <limits> // numeric_limits +#include <utility> // move() +#include <cassert> +#include <cstdlib> // strto*() +#include <type_traits> // enable_if, is_* +#include <cstring> // strlen() + +namespace butl +{ + namespace json + { + inline invalid_json_input:: + invalid_json_input (std::string n, + std::uint64_t l, + std::uint64_t c, + std::uint64_t p, + const std::string& d) + : invalid_json_input (move (n), l, c, p, d.c_str ()) + { + } + + inline invalid_json_input:: + invalid_json_input (std::string n, + std::uint64_t l, + std::uint64_t c, + std::uint64_t p, + const char* d) + : invalid_argument (d), + name (std::move (n)), + line (l), column (c), position (p) + { + } + + inline parser:: + parser (std::istream& is, + const std::string& n, + bool mv, + const char* sep) noexcept + : parser (is, n.c_str (), mv, sep) + { + } + + inline parser:: + parser (const void* t, + std::size_t s, + const std::string& n, + bool mv, + const char* sep) noexcept + : parser (t, s, n.c_str (), mv, sep) + { + } + + inline parser:: + parser (const std::string& t, + const std::string& n, + bool mv, + const char* sep) noexcept + : parser (t.data (), t.size (), n.c_str (), mv, sep) + { + } + + inline parser:: + parser (const std::string& t, + const char* n, + bool mv, + const char* sep) noexcept + : parser (t.data (), t.size (), n, mv, sep) + { + } + + inline parser:: + parser (const char* t, + const std::string& n, + bool mv, + const char* sep) noexcept + : parser (t, std::strlen (t), n.c_str (), mv, sep) + { + } + + inline parser:: + parser (const char* t, + const char* n, + bool mv, + const char* sep) noexcept + : parser (t, std::strlen (t), n, mv, sep) + { + } + + inline const std::string& 
parser:: + name () + { + if (!name_p_) + { + assert (parsed_ && !peeked_ && !value_p_); + cache_parsed_data (); + assert (name_p_); + } + return name_; + } + + inline std::string& parser:: + value () + { + if (!value_p_) + { + assert (parsed_ && !peeked_ && !name_p_); + cache_parsed_data (); + assert (value_p_); + } + return value_; + } + + // Note: one day we will be able to use C++17 from_chars() which was made + // exactly for this. + // + template <typename T> + inline typename std::enable_if<std::is_same<T, bool>::value, T>::type + parse_value (const char* b, size_t, const parser&) + { + return *b == 't'; + } + + template <typename T> + inline typename std::enable_if< + std::is_integral<T>::value && + std::is_signed<T>::value && + !std::is_same<T, bool>::value, T>::type + parse_value (const char* b, size_t n, const parser& p) + { + char* e (nullptr); + errno = 0; // We must clear it according to POSIX. + std::int64_t v (strtoll (b, &e, 10)); // Can't throw. + + if (e == b || e != b + n || errno == ERANGE || + v < std::numeric_limits<T>::min () || + v > std::numeric_limits<T>::max ()) + p.throw_invalid_value ("signed integer", b, n); + + return static_cast<T> (v); + } + + template <typename T> + inline typename std::enable_if< + std::is_integral<T>::value && + std::is_unsigned<T>::value && + !std::is_same<T, bool>::value, T>::type + parse_value (const char* b, size_t n, const parser& p) + { + char* e (nullptr); + errno = 0; // We must clear it according to POSIX. + std::uint64_t v (strtoull (b, &e, 10)); // Can't throw. + + if (e == b || e != b + n || errno == ERANGE || + v > std::numeric_limits<T>::max ()) + p.throw_invalid_value ("unsigned integer", b, n); + + return static_cast<T> (v); + } + + template <typename T> + inline typename std::enable_if<std::is_same<T, float>::value, T>::type + parse_value (const char* b, size_t n, const parser& p) + { + char* e (nullptr); + errno = 0; // We must clear it according to POSIX. 
+ T r (std::strtof (b, &e)); + + if (e == b || e != b + n || errno == ERANGE) + p.throw_invalid_value ("float", b, n); + + return r; + } + + template <typename T> + inline typename std::enable_if<std::is_same<T, double>::value, T>::type + parse_value (const char* b, size_t n, const parser& p) + { + char* e (nullptr); + errno = 0; // We must clear it according to POSIX. + T r (std::strtod (b, &e)); + + if (e == b || e != b + n || errno == ERANGE) + p.throw_invalid_value ("double", b, n); + + return r; + } + + template <typename T> + inline typename std::enable_if<std::is_same<T, long double>::value, T>::type + parse_value (const char* b, size_t n, const parser& p) + { + char* e (nullptr); + errno = 0; // We must clear it according to POSIX. + T r (std::strtold (b, &e)); + + if (e == b || e != b + n || errno == ERANGE) + p.throw_invalid_value ("long double", b, n); + + return r; + } + + template <typename T> + inline T parser:: + value () const + { + if (!value_p_) + { + assert (parsed_ && !peeked_ && value_event (translate (*parsed_))); + return parse_value<T> (raw_s_, raw_n_, *this); + } + + return parse_value<T> (value_.data (), value_.size (), *this); + } + + inline void parser:: + next_expect_name (const std::string& n, bool su) + { + next_expect_name (n.c_str (), su); + } + + // next_expect_<type>() + // + inline std::string& parser:: + next_expect_string () + { + next_expect (event::string); + return value (); + } + + template <typename T> + inline T parser:: + next_expect_string () + { + next_expect (event::string); + return value<T> (); + } + + inline std::string& parser:: + next_expect_number () + { + next_expect (event::number); + return value (); + } + + template <typename T> + inline T parser:: + next_expect_number () + { + next_expect (event::number); + return value<T> (); + } + + inline std::string& parser:: + next_expect_boolean () + { + next_expect (event::boolean); + return value (); + } + + template <typename T> + inline T parser:: + 
next_expect_boolean () + { + next_expect (event::boolean); + return value<T> (); + } + + // next_expect_<type>_null() + // + inline std::string* parser:: + next_expect_string_null () + { + return next_expect (event::string, event::null) ? &value () : nullptr; + } + + template <typename T> + inline optional<T> parser:: + next_expect_string_null () + { + return next_expect (event::string, event::null) + ? optional<T> (value<T> ()) + : nullopt; + } + + inline std::string* parser:: + next_expect_number_null () + { + return next_expect (event::number, event::null) ? &value () : nullptr; + } + + template <typename T> + inline optional<T> parser:: + next_expect_number_null () + { + return next_expect (event::number, event::null) + ? optional<T> (value<T> ()) + : nullopt; + } + + inline std::string* parser:: + next_expect_boolean_null () + { + return next_expect (event::boolean, event::null) ? &value () : nullptr; + } + + template <typename T> + inline optional<T> parser:: + next_expect_boolean_null () + { + return next_expect (event::boolean, event::null) + ? 
optional<T> (value<T> ()) + : nullopt; + } + + // next_expect_member_string() + // + inline std::string& parser:: + next_expect_member_string (const char* n, bool su) + { + next_expect_name (n, su); + return next_expect_string (); + } + + inline std::string& parser:: + next_expect_member_string (const std::string& n, bool su) + { + return next_expect_member_string (n.c_str (), su); + } + + template <typename T> + inline T parser:: + next_expect_member_string (const char* n, bool su) + { + next_expect_name (n, su); + return next_expect_string<T> (); + } + + template <typename T> + inline T parser:: + next_expect_member_string (const std::string& n, bool su) + { + return next_expect_member_string<T> (n.c_str (), su); + } + + // next_expect_member_number() + // + inline std::string& parser:: + next_expect_member_number (const char* n, bool su) + { + next_expect_name (n, su); + return next_expect_number (); + } + + inline std::string& parser:: + next_expect_member_number (const std::string& n, bool su) + { + return next_expect_member_number (n.c_str (), su); + } + + template <typename T> + inline T parser:: + next_expect_member_number (const char* n, bool su) + { + next_expect_name (n, su); + return next_expect_number<T> (); + } + + template <typename T> + inline T parser:: + next_expect_member_number (const std::string& n, bool su) + { + return next_expect_member_number<T> (n.c_str (), su); + } + + // next_expect_member_boolean() + // + inline std::string& parser:: + next_expect_member_boolean (const char* n, bool su) + { + next_expect_name (n, su); + return next_expect_boolean (); + } + + inline std::string& parser:: + next_expect_member_boolean (const std::string& n, bool su) + { + return next_expect_member_boolean (n.c_str (), su); + } + + template <typename T> + inline T parser:: + next_expect_member_boolean (const char* n, bool su) + { + next_expect_name (n, su); + return next_expect_boolean<T> (); + } + + template <typename T> + inline T parser:: + 
next_expect_member_boolean (const std::string& n, bool su) + { + return next_expect_member_boolean<T> (n.c_str (), su); + } + + // next_expect_member_string_null() + // + inline std::string* parser:: + next_expect_member_string_null (const char* n, bool su) + { + next_expect_name (n, su); + return next_expect_string_null (); + } + + inline std::string* parser:: + next_expect_member_string_null (const std::string& n, bool su) + { + return next_expect_member_string_null (n.c_str (), su); + } + + template <typename T> + inline optional<T> parser:: + next_expect_member_string_null (const char* n, bool su) + { + next_expect_name (n, su); + return next_expect_string_null<T> (); + } + + template <typename T> + inline optional<T> parser:: + next_expect_member_string_null (const std::string& n, bool su) + { + return next_expect_member_string_null<T> (n.c_str (), su); + } + + // next_expect_member_number_null() + // + inline std::string* parser:: + next_expect_member_number_null (const char* n, bool su) + { + next_expect_name (n, su); + return next_expect_number_null (); + } + + inline std::string* parser:: + next_expect_member_number_null (const std::string& n, bool su) + { + return next_expect_member_number_null (n.c_str (), su); + } + + template <typename T> + inline optional<T> parser:: + next_expect_member_number_null (const char* n, bool su) + { + next_expect_name (n, su); + return next_expect_number_null<T> (); + } + + template <typename T> + inline optional<T> parser:: + next_expect_member_number_null (const std::string& n, bool su) + { + return next_expect_member_number_null<T> (n.c_str (), su); + } + + // next_expect_member_boolean_null() + // + inline std::string* parser:: + next_expect_member_boolean_null (const char* n, bool su) + { + next_expect_name (n, su); + return next_expect_boolean_null (); + } + + inline std::string* parser:: + next_expect_member_boolean_null (const std::string& n, bool su) + { + return next_expect_member_boolean_null (n.c_str (), su); + 
} + + template <typename T> + inline optional<T> parser:: + next_expect_member_boolean_null (const char* n, bool su) + { + next_expect_name (n, su); + return next_expect_boolean_null<T> (); + } + + template <typename T> + inline optional<T> parser:: + next_expect_member_boolean_null (const std::string& n, bool su) + { + return next_expect_member_boolean_null<T> (n.c_str (), su); + } + + // next_expect_member_object[_null]() + // + inline void parser:: + next_expect_member_object (const char* n, bool su) + { + next_expect_name (n, su); + next_expect (event::begin_object); + } + + inline void parser:: + next_expect_member_object (const std::string& n, bool su) + { + next_expect_member_object (n.c_str (), su); + } + + inline bool parser:: + next_expect_member_object_null (const char* n, bool su) + { + next_expect_name (n, su); + return next_expect (event::begin_object, event::null); + } + + inline bool parser:: + next_expect_member_object_null (const std::string& n, bool su) + { + return next_expect_member_object_null (n.c_str (), su); + } + + // next_expect_member_array[_null]() + // + inline void parser:: + next_expect_member_array (const char* n, bool su) + { + next_expect_name (n, su); + next_expect (event::begin_array); + } + + inline void parser:: + next_expect_member_array (const std::string& n, bool su) + { + next_expect_member_array (n.c_str (), su); + } + + inline bool parser:: + next_expect_member_array_null (const char* n, bool su) + { + next_expect_name (n, su); + return next_expect (event::begin_array, event::null); + } + + inline bool parser:: + next_expect_member_array_null (const std::string& n, bool su) + { + return next_expect_member_array_null (n.c_str (), su); + } + } +} diff --git a/libbutl/json/pdjson.c b/libbutl/json/pdjson.c new file mode 100644 index 0000000..ae10c95 --- /dev/null +++ b/libbutl/json/pdjson.c @@ -0,0 +1,1044 @@ +#ifndef _POSIX_C_SOURCE +# define _POSIX_C_SOURCE 200112L +#elif _POSIX_C_SOURCE < 200112L +# error incompatible 
_POSIX_C_SOURCE level +#endif + +#include <stdlib.h> +#include <string.h> +#include <ctype.h> + +#ifndef PDJSON_H +# include "pdjson.h" +#endif + +#define JSON_FLAG_ERROR (1u << 0) +#define JSON_FLAG_STREAMING (1u << 1) + +#if defined(_MSC_VER) && (_MSC_VER < 1900) + +#define json_error(json, format, ...) \ + if (!(json->flags & JSON_FLAG_ERROR)) { \ + json->flags |= JSON_FLAG_ERROR; \ + _snprintf_s(json->errmsg, sizeof(json->errmsg), \ + _TRUNCATE, \ + format, \ + __VA_ARGS__); \ + } \ + +#else + +#define json_error(json, format, ...) \ + if (!(json->flags & JSON_FLAG_ERROR)) { \ + json->flags |= JSON_FLAG_ERROR; \ + snprintf(json->errmsg, sizeof(json->errmsg), \ + format, \ + __VA_ARGS__); \ + } \ + +#endif /* _MSC_VER */ + +/* See also PDJSON_STACK_MAX below. */ +#ifndef PDJSON_STACK_INC +# define PDJSON_STACK_INC 4 +#endif + +struct json_stack { + enum json_type type; + long count; +}; + +static enum json_type +push(json_stream *json, enum json_type type) +{ + json->stack_top++; + +#ifdef PDJSON_STACK_MAX + if (json->stack_top > PDJSON_STACK_MAX) { + json_error(json, "%s", "maximum depth of nesting reached"); + return JSON_ERROR; + } +#endif + + if (json->stack_top >= json->stack_size) { + struct json_stack *stack; + size_t size = (json->stack_size + PDJSON_STACK_INC) * sizeof(*json->stack); + stack = (struct json_stack *)json->alloc.realloc(json->stack, size); + if (stack == NULL) { + json_error(json, "%s", "out of memory"); + return JSON_ERROR; + } + + json->stack_size += PDJSON_STACK_INC; + json->stack = stack; + } + + json->stack[json->stack_top].type = type; + json->stack[json->stack_top].count = 0; + + return type; +} + +/* Note: c is assumed not to be EOF. */ +static enum json_type +pop(json_stream *json, int c, enum json_type expected) +{ + if (json->stack == NULL || json->stack[json->stack_top].type != expected) { + json_error(json, "unexpected byte '%c'", c); + return JSON_ERROR; + } + json->stack_top--; + return expected == JSON_ARRAY ? 
JSON_ARRAY_END : JSON_OBJECT_END; +} + +static int buffer_peek(struct json_source *source) +{ + if (source->position < source->source.buffer.length) + return source->source.buffer.buffer[source->position]; + else + return EOF; +} + +static int buffer_get(struct json_source *source) +{ + int c = source->peek(source); + if (c != EOF) + source->position++; + return c; +} + +static int stream_get(struct json_source *source) +{ + int c = fgetc(source->source.stream.stream); + if (c != EOF) + source->position++; + return c; +} + +static int stream_peek(struct json_source *source) +{ + int c = fgetc(source->source.stream.stream); + ungetc(c, source->source.stream.stream); + return c; +} + +static void init(json_stream *json) +{ + json->lineno = 1; + json->linepos = 0; + json->lineadj = 0; + json->linecon = 0; + json->colno = 0; + json->flags = JSON_FLAG_STREAMING; + json->errmsg[0] = '\0'; + json->ntokens = 0; + json->next = (enum json_type)0; + + json->stack = NULL; + json->stack_top = -1; + json->stack_size = 0; + + json->data.string = NULL; + json->data.string_size = 0; + json->data.string_fill = 0; + json->source.position = 0; + + json->alloc.malloc = malloc; + json->alloc.realloc = realloc; + json->alloc.free = free; +} + +static enum json_type +is_match(json_stream *json, const char *pattern, enum json_type type) +{ + int c; + for (const char *p = pattern; *p; p++) { + if (*p != (c = json->source.get(&json->source))) { + if (c != EOF) { + json_error(json, "expected '%c' instead of byte '%c'", *p, c); + } else { + json_error(json, "expected '%c' instead of end of text", *p); + } + return JSON_ERROR; + } + } + return type; +} + +static int pushchar(json_stream *json, int c) +{ + if (json->data.string_fill == json->data.string_size) { + size_t size = json->data.string_size * 2; + char *buffer = (char *)json->alloc.realloc(json->data.string, size); + if (buffer == NULL) { + json_error(json, "%s", "out of memory"); + return -1; + } else { + json->data.string_size = size; 
+ json->data.string = buffer; + } + } + json->data.string[json->data.string_fill++] = c; + return 0; +} + +static int init_string(json_stream *json) +{ + json->data.string_fill = 0; + if (json->data.string == NULL) { + json->data.string_size = 1024; + json->data.string = (char *)json->alloc.malloc(json->data.string_size); + if (json->data.string == NULL) { + json_error(json, "%s", "out of memory"); + return -1; + } + } + json->data.string[0] = '\0'; + return 0; +} + +static int encode_utf8(json_stream *json, unsigned long c) +{ + if (c < 0x80UL) { + return pushchar(json, c); + } else if (c < 0x0800UL) { + return !((pushchar(json, (c >> 6 & 0x1F) | 0xC0) == 0) && + (pushchar(json, (c >> 0 & 0x3F) | 0x80) == 0)); + } else if (c < 0x010000UL) { + if (c >= 0xd800 && c <= 0xdfff) { + json_error(json, "invalid codepoint %06lx", c); + return -1; + } + return !((pushchar(json, (c >> 12 & 0x0F) | 0xE0) == 0) && + (pushchar(json, (c >> 6 & 0x3F) | 0x80) == 0) && + (pushchar(json, (c >> 0 & 0x3F) | 0x80) == 0)); + } else if (c < 0x110000UL) { + return !((pushchar(json, (c >> 18 & 0x07) | 0xF0) == 0) && + (pushchar(json, (c >> 12 & 0x3F) | 0x80) == 0) && + (pushchar(json, (c >> 6 & 0x3F) | 0x80) == 0) && + (pushchar(json, (c >> 0 & 0x3F) | 0x80) == 0)); + } else { + json_error(json, "unable to encode %06lx as UTF-8", c); + return -1; + } +} + +static int hexchar(int c) +{ + switch (c) { + case '0': return 0; + case '1': return 1; + case '2': return 2; + case '3': return 3; + case '4': return 4; + case '5': return 5; + case '6': return 6; + case '7': return 7; + case '8': return 8; + case '9': return 9; + case 'a': + case 'A': return 10; + case 'b': + case 'B': return 11; + case 'c': + case 'C': return 12; + case 'd': + case 'D': return 13; + case 'e': + case 'E': return 14; + case 'f': + case 'F': return 15; + default: + return -1; + } +} + +static long +read_unicode_cp(json_stream *json) +{ + long cp = 0; + int shift = 12; + + for (size_t i = 0; i < 4; i++) { + int c = 
json->source.get(&json->source); + int hc; + + if (c == EOF) { + json_error(json, "%s", "unterminated string literal in Unicode"); + return -1; + } else if ((hc = hexchar(c)) == -1) { + json_error(json, "invalid escape Unicode byte '%c'", c); + return -1; + } + + cp += hc * (1 << shift); + shift -= 4; + } + + + return cp; +} + +static int read_unicode(json_stream *json) +{ + long cp, h, l; + + if ((cp = read_unicode_cp(json)) == -1) { + return -1; + } + + if (cp >= 0xd800 && cp <= 0xdbff) { + /* This is the high portion of a surrogate pair; we need to read the + * lower portion to get the codepoint + */ + h = cp; + + int c = json->source.get(&json->source); + if (c == EOF) { + json_error(json, "%s", "unterminated string literal in Unicode"); + return -1; + } else if (c != '\\') { + json_error(json, "invalid continuation for surrogate pair '%c', " + "expected '\\'", c); + return -1; + } + + c = json->source.get(&json->source); + if (c == EOF) { + json_error(json, "%s", "unterminated string literal in Unicode"); + return -1; + } else if (c != 'u') { + json_error(json, "invalid continuation for surrogate pair '%c', " + "expected 'u'", c); + return -1; + } + + if ((l = read_unicode_cp(json)) == -1) { + return -1; + } + + if (l < 0xdc00 || l > 0xdfff) { + json_error(json, "surrogate pair continuation \\u%04lx out " + "of range (dc00-dfff)", l); + return -1; + } + + cp = ((h - 0xd800) * 0x400) + ((l - 0xdc00) + 0x10000); + } else if (cp >= 0xdc00 && cp <= 0xdfff) { + json_error(json, "dangling surrogate \\u%04lx", cp); + return -1; + } + + return encode_utf8(json, cp); +} + +static int +read_escaped(json_stream *json) +{ + int c = json->source.get(&json->source); + if (c == EOF) { + json_error(json, "%s", "unterminated string literal in escape"); + return -1; + } else if (c == 'u') { + if (read_unicode(json) != 0) + return -1; + } else { + switch (c) { + case '\\': + case 'b': + case 'f': + case 'n': + case 'r': + case 't': + case '/': + case '"': + { + const char *codes 
= "\\bfnrt/\""; + const char *p = strchr(codes, c); + if (pushchar(json, "\\\b\f\n\r\t/\""[p - codes]) != 0) + return -1; + } + break; + default: + json_error(json, "invalid escaped byte '%c'", c); + return -1; + } + } + return 0; +} + +static int +char_needs_escaping(int c) +{ + if ((c >= 0) && (c < 0x20 || c == 0x22 || c == 0x5c)) { + return 1; + } + + return 0; +} + +static int +utf8_seq_length(char byte) +{ + unsigned char u = (unsigned char) byte; + if (u < 0x80) return 1; + + if (0x80 <= u && u <= 0xBF) + { + // second, third or fourth byte of a multi-byte + // sequence, i.e. a "continuation byte" + return 0; + } + else if (u == 0xC0 || u == 0xC1) + { + // overlong encoding of an ASCII byte + return 0; + } + else if (0xC2 <= u && u <= 0xDF) + { + // 2-byte sequence + return 2; + } + else if (0xE0 <= u && u <= 0xEF) + { + // 3-byte sequence + return 3; + } + else if (0xF0 <= u && u <= 0xF4) + { + // 4-byte sequence + return 4; + } + else + { + // u >= 0xF5 + // Restricted (start of 4-, 5- or 6-byte sequence) or invalid UTF-8 + return 0; + } +} + +static int +is_legal_utf8(const unsigned char *bytes, int length) +{ + if (0 == bytes || 0 == length) return 0; + + unsigned char a; + const unsigned char* srcptr = bytes + length; + switch (length) + { + default: + return 0; + // Everything else falls through when true. 
+ case 4: + if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0; + /* FALLTHRU */ + case 3: + if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0; + /* FALLTHRU */ + case 2: + a = (*--srcptr); + switch (*bytes) + { + case 0xE0: + if (a < 0xA0 || a > 0xBF) return 0; + break; + case 0xED: + if (a < 0x80 || a > 0x9F) return 0; + break; + case 0xF0: + if (a < 0x90 || a > 0xBF) return 0; + break; + case 0xF4: + if (a < 0x80 || a > 0x8F) return 0; + break; + default: + if (a < 0x80 || a > 0xBF) return 0; + break; + } + /* FALLTHRU */ + case 1: + if (*bytes >= 0x80 && *bytes < 0xC2) return 0; + } + return *bytes <= 0xF4; +} + +static int +read_utf8(json_stream* json, int next_char) +{ + int count = utf8_seq_length(next_char); + if (!count) + { + json_error(json, "%s", "invalid UTF-8 character"); + return -1; + } + + char buffer[4]; + buffer[0] = next_char; + int i; + for (i = 1; i < count; ++i) + { + if ((next_char = json->source.get(&json->source)) == EOF) + break; + + buffer[i] = next_char; + json->lineadj++; + } + + if (i != count || !is_legal_utf8((unsigned char*) buffer, count)) + { + json_error(json, "%s", "invalid UTF-8 text"); + return -1; + } + + for (i = 0; i < count; ++i) + { + if (pushchar(json, buffer[i]) != 0) + return -1; + } + return 0; +} + +static enum json_type +read_string(json_stream *json) +{ + if (init_string(json) != 0) + return JSON_ERROR; + while (1) { + int c = json->source.get(&json->source); + if (c == EOF) { + json_error(json, "%s", "unterminated string literal"); + return JSON_ERROR; + } else if (c == '"') { + if (pushchar(json, '\0') == 0) + return JSON_STRING; + else + return JSON_ERROR; + } else if (c == '\\') { + if (read_escaped(json) != 0) + return JSON_ERROR; + } else if ((unsigned) c >= 0x80) { + if (read_utf8(json, c) != 0) + return JSON_ERROR; + } else { + if (char_needs_escaping(c)) { + json_error(json, "%s", "unescaped control character in string"); + return JSON_ERROR; + } + + if (pushchar(json, c) != 0) + return JSON_ERROR; + } + 
} + return JSON_ERROR; +} + +static int +is_digit(int c) +{ + return c >= 48 /*0*/ && c <= 57 /*9*/; +} + +static int +read_digits(json_stream *json) +{ + int c; + unsigned nread = 0; + while (is_digit(c = json->source.peek(&json->source))) { + if (pushchar(json, json->source.get(&json->source)) != 0) + return -1; + + nread++; + } + + if (nread == 0) { + if (c != EOF) { + json_error(json, "expected digit instead of byte '%c'", c); + } else { + json_error(json, "%s", "expected digit instead of end of text"); + } + return -1; + } + + return 0; +} + +static enum json_type +read_number(json_stream *json, int c) +{ + if (pushchar(json, c) != 0) + return JSON_ERROR; + if (c == '-') { + c = json->source.get(&json->source); + if (is_digit(c)) { + return read_number(json, c); + } else { + if (c != EOF) { + json_error(json, "unexpected byte '%c' in number", c); + } else { + json_error(json, "%s", "unexpected end of text in number"); + } + return JSON_ERROR; + } + } else if (strchr("123456789", c) != NULL) { + c = json->source.peek(&json->source); + if (is_digit(c)) { + if (read_digits(json) != 0) + return JSON_ERROR; + } + } + /* Up to decimal or exponent has been read. */ + c = json->source.peek(&json->source); + if (strchr(".eE", c) == NULL) { + if (pushchar(json, '\0') != 0) + return JSON_ERROR; + else + return JSON_NUMBER; + } + if (c == '.') { + json->source.get(&json->source); // consume . + if (pushchar(json, c) != 0) + return JSON_ERROR; + if (read_digits(json) != 0) + return JSON_ERROR; + } + /* Check for exponent. 
*/ + c = json->source.peek(&json->source); + if (c == 'e' || c == 'E') { + json->source.get(&json->source); // consume e/E + if (pushchar(json, c) != 0) + return JSON_ERROR; + c = json->source.peek(&json->source); + if (c == '+' || c == '-') { + json->source.get(&json->source); // consume + if (pushchar(json, c) != 0) + return JSON_ERROR; + if (read_digits(json) != 0) + return JSON_ERROR; + } else if (is_digit(c)) { + if (read_digits(json) != 0) + return JSON_ERROR; + } else { + json->source.get(&json->source); // consume (for column) + if (c != EOF) { + json_error(json, "unexpected byte '%c' in number", c); + } else { + json_error(json, "%s", "unexpected end of text in number"); + } + return JSON_ERROR; + } + } + if (pushchar(json, '\0') != 0) + return JSON_ERROR; + else + return JSON_NUMBER; +} + +bool +json_isspace(int c) +{ + switch (c) { + case 0x09: + case 0x0a: + case 0x0d: + case 0x20: + return true; + } + + return false; +} + +static void newline(json_stream *json) +{ + json->lineno++; + json->linepos = json->source.position; + json->lineadj = 0; + json->linecon = 0; +} + +/* Returns the next non-whitespace character in the stream. + * + * Note that this is the only function (besides user-facing json_source_get()) + * that needs to worry about newline housekeeping. 
+ */ +static int next(json_stream *json) +{ + int c; + while (json_isspace(c = json->source.get(&json->source))) + if (c == '\n') + newline(json); + return c; +} +/* Read the value introduced by the byte c, which the caller has already + * taken from the source, and return its event type (JSON_ERROR on failure). + * The column is sampled before the value is read but stored in json->colno + * only on success; on error json->colno is left as it was. + */ +static enum json_type +read_value(json_stream *json, int c) +{ + enum json_type type; + size_t colno = json_get_column(json); + /* Count the token before reading it, so failed reads are counted too. */ + json->ntokens++; + + switch (c) { + case EOF: + json_error(json, "%s", "unexpected end of text"); + type = JSON_ERROR; + break; + case '{': + type = push(json, JSON_OBJECT); + break; + case '[': + type = push(json, JSON_ARRAY); + break; + case '"': + type = read_string(json); + break; + case 'n': + type = is_match(json, "ull", JSON_NULL); + break; + case 'f': + type = is_match(json, "alse", JSON_FALSE); + break; + case 't': + type = is_match(json, "rue", JSON_TRUE); + break; + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + case '-': + type = init_string(json) == 0 ? read_number(json, c) : JSON_ERROR; + break; + default: + type = JSON_ERROR; + json_error(json, "unexpected byte '%c' in value", c); + break; + } + /* Cache the start column for successfully read values only. */ + if (type != JSON_ERROR) + json->colno = colno; + + return type; +} +/* Return the next event without consuming it: the event is stored in + * json->next, where the following json_next() call picks it up. + */ +enum json_type json_peek(json_stream *json) +{ + enum json_type next; + if (json->next) + next = json->next; + else + next = json->next = json_next(json); + return next; +} +/* Advance the parser by one event and return its type. Returns JSON_ERROR + * right away while the error flag is set and otherwise hands out a + * previously peeked event, if any, before reading more input. + */ +enum json_type json_next(json_stream *json) +{ + if (json->flags & JSON_FLAG_ERROR) + return JSON_ERROR; + if (json->next != 0) { + enum json_type next = json->next; + json->next = (enum json_type)0; + return next; + } + /* Drop the start column cached for the previous event. */ + json->colno = 0; + /* At least one token was read and no object or array is open. */ + if (json->ntokens > 0 && json->stack_top == (size_t)-1) { + + /* In the streaming mode leave any trailing whitespaces in the stream. + * This allows the user to validate any desired separation between + * values (such as newlines) using json_source_get/peek() with any + * remaining whitespaces ignored as leading when we parse the next + * value. 
*/ + if (!(json->flags & JSON_FLAG_STREAMING)) { + int c = next(json); + if (c != EOF) { + json_error(json, "expected end of text instead of byte '%c'", c); + return JSON_ERROR; + } + } + /* There are no further events for this top-level value. */ + return JSON_DONE; + } + int c = next(json); + if (json->stack_top == (size_t)-1) { + if (c == EOF && (json->flags & JSON_FLAG_STREAMING)) + return JSON_DONE; + /* No object or array is open, so c has to start a top-level value. */ + return read_value(json, c); + } + if (json->stack[json->stack_top].type == JSON_ARRAY) { + if (json->stack[json->stack_top].count == 0) { + if (c == ']') { + return pop(json, c, JSON_ARRAY); + } + json->stack[json->stack_top].count++; + return read_value(json, c); + } else if (c == ',') { + json->stack[json->stack_top].count++; + return read_value(json, next(json)); + } else if (c == ']') { + return pop(json, c, JSON_ARRAY); + } else { + if (c != EOF) { + json_error(json, "unexpected byte '%c'", c); + } else { + json_error(json, "%s", "unexpected end of text"); + } + return JSON_ERROR; + } + } else if (json->stack[json->stack_top].type == JSON_OBJECT) { + if (json->stack[json->stack_top].count == 0) { + if (c == '}') { + return pop(json, c, JSON_OBJECT); + } + + /* No member name/value pairs yet, so anything other than the closing + * brace has to be a member name, that is, a string. */ + enum json_type value = read_value(json, c); + if (value != JSON_STRING) { + if (value != JSON_ERROR) + json_error(json, "%s", "expected member name or '}'"); + return JSON_ERROR; + } else { + json->stack[json->stack_top].count++; + return value; + } + } else if ((json->stack[json->stack_top].count % 2) == 0) { + /* Expecting comma followed by member name. An even, non-zero count + * means that a member value was the last event at this level. 
*/ + if (c != ',' && c != '}') { + json_error(json, "%s", "expected ',' or '}' after member value"); + return JSON_ERROR; + } else if (c == '}') { + return pop(json, c, JSON_OBJECT); + } else { + enum json_type value = read_value(json, next(json)); + if (value != JSON_STRING) { + if (value != JSON_ERROR) + json_error(json, "%s", "expected member name"); + return JSON_ERROR; + } else { + json->stack[json->stack_top].count++; + return value; + } + } + } else if ((json->stack[json->stack_top].count % 2) == 1) { + /* Expecting colon followed by value: an odd count means that a member + * name was the last event at this level. */ + if (c != ':') { + json_error(json, "%s", "expected ':' after member name"); + return JSON_ERROR; + } else { + json->stack[json->stack_top].count++; + return read_value(json, next(json)); + } + } + } + json_error(json, "%s", "invalid parser state"); + return JSON_ERROR; +} +/* Reset the nesting level and the token count and clear the error flag and + * message. A pending peeked event (json->next) is left untouched. + */ +void json_reset(json_stream *json) +{ + json->stack_top = -1; + json->ntokens = 0; + json->flags &= ~JSON_FLAG_ERROR; + json->errmsg[0] = '\0'; +} +/* Skip one complete value: consume the next event and, if it opens an array + * or object, everything up to and including the matching end event. Returns + * the type of the first event consumed, or JSON_ERROR/JSON_DONE as soon as + * json_next() returns one of them. + */ +enum json_type json_skip(json_stream *json) +{ + enum json_type type = json_next(json); + size_t cnt_arr = 0; + size_t cnt_obj = 0; + /* Track the arrays and objects opened since the first event. */ + for (enum json_type skip = type; ; skip = json_next(json)) { + if (skip == JSON_ERROR || skip == JSON_DONE) + return skip; + + if (skip == JSON_ARRAY) { + ++cnt_arr; + } else if (skip == JSON_ARRAY_END && cnt_arr > 0) { + --cnt_arr; + } else if (skip == JSON_OBJECT) { + ++cnt_obj; + } else if (skip == JSON_OBJECT_END && cnt_obj > 0) { + --cnt_obj; + } + /* Stop once nothing opened during the skip remains open. */ + if (!cnt_arr && !cnt_obj) + break; + } + + return type; +} +/* Call json_skip() until it returns the requested type. JSON_ERROR and + * JSON_DONE end the search early and are returned instead. + */ +enum json_type json_skip_until(json_stream *json, enum json_type type) +{ + while (1) { + enum json_type skip = json_skip(json); + + if (skip == JSON_ERROR || skip == JSON_DONE) + return skip; + + if (skip == type) + break; + } + + return type; +} +/* Return the current string buffer, or an empty string literal if + * data.string is NULL. If length is not NULL, data.string_fill is stored + * in it. + */ +const char *json_get_string(json_stream *json, size_t *length) +{ + if (length != NULL) + *length = json->data.string_fill; + if (json->data.string == NULL) + return ""; + else + return json->data.string; +} 
+/* Convert the current string buffer to a double with strtod(). Returns 0 if + * data.string is NULL. + */ +double json_get_number(json_stream *json) +{ + char *p = json->data.string; + return p == NULL ? 0 : strtod(p, NULL); +} +/* Return the error message if the error flag is set and NULL otherwise. */ +const char *json_get_error(json_stream *json) +{ + return json->flags & JSON_FLAG_ERROR ? json->errmsg : NULL; +} +/* Return the line counter kept in json->lineno. */ +size_t json_get_lineno(json_stream *json) +{ + return json->lineno; +} +/* Return the position counter of the input source. */ +size_t json_get_position(json_stream *json) +{ + return json->source.position; +} +/* Return the cached start column (colno) if it is set. Otherwise derive the + * column from the source position: 1 while the position is still 0, else the + * position less the start-of-line position (linepos) and the adjustment for + * UTF-8 continuation bytes (lineadj). + */ +size_t json_get_column(json_stream *json) +{ + return json->colno == 0 + ? json->source.position == 0 ? 1 : json->source.position - json->linepos - json->lineadj + : json->colno; +} +/* Return stack_top + 1 as the nesting depth. With nothing open stack_top is + * (size_t)-1, so the unsigned addition wraps around to 0. + */ +size_t json_get_depth(json_stream *json) +{ + return json->stack_top + 1; +} + +/* Return the current parsing context, that is, JSON_OBJECT if we are inside + an object, JSON_ARRAY if we are inside an array, and JSON_DONE if we are + not yet/anymore in either. + + Additionally, for the first two cases, also return the number of parsing + events that have already been observed at this level with json_next/peek(). + In particular, inside an object, an odd number would indicate that the just + observed JSON_STRING event is a member name. +*/ +enum json_type json_get_context(json_stream *json, size_t *count) +{ + if (json->stack_top == (size_t)-1) + return JSON_DONE; + /* Past this point an object or array is open; count is optional. */ + if (count != NULL) + *count = json->stack[json->stack_top].count; + + return json->stack[json->stack_top].type; +} +/* Read one byte from the source for the caller while keeping the line and + * column bookkeeping (newline(), linecon, lineadj) up to date. + */ +int json_source_get(json_stream *json) +{ + /* If the caller reads a multi-byte UTF-8 sequence, we expect them to read + * it in its entirety. We also assume that any invalid bytes within such a + * sequence belong to the same column (as opposed to starting a new column + * or some such). */ + + int c = json->source.get(&json->source); + if (json->linecon > 0) { + /* Expecting a continuation byte within a multi-byte UTF-8 sequence. */ + json->linecon--; + if (c != EOF) + json->lineadj++; + } else if (c == '\n') + newline(json); + else if (c >= 0xC2 && c <= 0xF4) /* First in multi-byte UTF-8 sequence. 
*/ + json->linecon = utf8_seq_length(c) - 1; + + return c; +} +/* Return what the peek callback of the source reports. None of the line or + * column bookkeeping is updated. + */ +int json_source_peek(json_stream *json) +{ + return json->source.peek(&json->source); +} +/* Initialize the stream, install the buffer callbacks and record the buffer + * pointer and its size. + */ +void json_open_buffer(json_stream *json, const void *buffer, size_t size) +{ + init(json); + json->source.get = buffer_get; + json->source.peek = buffer_peek; + json->source.source.buffer.buffer = (const char *)buffer; + json->source.source.buffer.length = size; +} +/* As json_open_buffer() with the size taken from strlen(), so the + * terminating NUL is not part of the input. + */ +void json_open_string(json_stream *json, const char *string) +{ + json_open_buffer(json, string, strlen(string)); +} +/* Initialize the stream, install the stdio callbacks and record the FILE + * pointer. + */ +void json_open_stream(json_stream *json, FILE * stream) +{ + init(json); + json->source.get = stream_get; + json->source.peek = stream_peek; + json->source.source.stream.stream = stream; +} +/* Source get callback for user I/O: forward to the user get function and + * advance the position unless it returned EOF. + */ +static int user_get(struct json_source *json) +{ + int c = json->source.user.get(json->source.user.ptr); + if (c != EOF) + json->position++; + return c; +} +/* Source peek callback for user I/O: forward to the user peek function. */ +static int user_peek(struct json_source *json) +{ + return json->source.user.peek(json->source.user.ptr); +} +/* Initialize the stream to read through the supplied get and peek functions. + * The user pointer is passed back to both of them. + */ +void json_open_user(json_stream *json, json_user_io get, json_user_io peek, void *user) +{ + init(json); + json->source.get = user_get; + json->source.peek = user_peek; + json->source.source.user.ptr = user; + json->source.source.user.get = get; + json->source.source.user.peek = peek; +} +/* Copy the allocator functions from *a into the stream. */ +void json_set_allocator(json_stream *json, json_allocator *a) +{ + json->alloc = *a; +} +/* Set or clear JSON_FLAG_STREAMING according to the argument. */ +void json_set_streaming(json_stream *json, bool streaming) +{ + if (streaming) + json->flags |= JSON_FLAG_STREAMING; + else + json->flags &= ~JSON_FLAG_STREAMING; +} +/* Free the nesting stack and the string buffer with the free function of the + * stream allocator. The json_stream structure itself is not freed. + */ +void json_close(json_stream *json) +{ + json->alloc.free(json->stack); + json->alloc.free(json->data.string); +} diff --git a/libbutl/json/pdjson.h b/libbutl/json/pdjson.h new file mode 100644 index 0000000..ac698e4 --- /dev/null +++ b/libbutl/json/pdjson.h @@ -0,0 +1,147 @@ +#ifndef PDJSON_H +#define PDJSON_H + +#ifndef PDJSON_SYMEXPORT +# define PDJSON_SYMEXPORT +#endif + +#ifdef __cplusplus +extern "C" { +#else +#if 
defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) + #include <stdbool.h> +#else + #ifndef bool + #define bool int + #define true 1 + #define false 0 + #endif /* bool */ +#endif /* __STDC_VERSION__ */ +#endif /* __cplusplus */ + +#include <stdio.h> + +enum json_type { + JSON_ERROR = 1, JSON_DONE, + JSON_OBJECT, JSON_OBJECT_END, JSON_ARRAY, JSON_ARRAY_END, + JSON_STRING, JSON_NUMBER, JSON_TRUE, JSON_FALSE, JSON_NULL +}; + +struct json_allocator { + void *(*malloc)(size_t); + void *(*realloc)(void *, size_t); + void (*free)(void *); +}; + +typedef int (*json_user_io)(void *user); + +typedef struct json_stream json_stream; +typedef struct json_allocator json_allocator; + +PDJSON_SYMEXPORT void json_open_buffer(json_stream *json, const void *buffer, size_t size); +PDJSON_SYMEXPORT void json_open_string(json_stream *json, const char *string); +PDJSON_SYMEXPORT void json_open_stream(json_stream *json, FILE *stream); +PDJSON_SYMEXPORT void json_open_user(json_stream *json, json_user_io get, json_user_io peek, void *user); +PDJSON_SYMEXPORT void json_close(json_stream *json); + +PDJSON_SYMEXPORT void json_set_allocator(json_stream *json, json_allocator *a); +PDJSON_SYMEXPORT void json_set_streaming(json_stream *json, bool mode); + +PDJSON_SYMEXPORT enum json_type json_next(json_stream *json); +PDJSON_SYMEXPORT enum json_type json_peek(json_stream *json); +PDJSON_SYMEXPORT void json_reset(json_stream *json); +PDJSON_SYMEXPORT const char *json_get_string(json_stream *json, size_t *length); +PDJSON_SYMEXPORT double json_get_number(json_stream *json); + +PDJSON_SYMEXPORT enum json_type json_skip(json_stream *json); +PDJSON_SYMEXPORT enum json_type json_skip_until(json_stream *json, enum json_type type); + +PDJSON_SYMEXPORT size_t json_get_lineno(json_stream *json); +PDJSON_SYMEXPORT size_t json_get_position(json_stream *json); +PDJSON_SYMEXPORT size_t json_get_column(json_stream *json); +PDJSON_SYMEXPORT size_t json_get_depth(json_stream *json); +PDJSON_SYMEXPORT enum 
json_type json_get_context(json_stream *json, size_t *count); +PDJSON_SYMEXPORT const char *json_get_error(json_stream *json); + +PDJSON_SYMEXPORT int json_source_get(json_stream *json); +PDJSON_SYMEXPORT int json_source_peek(json_stream *json); +PDJSON_SYMEXPORT bool json_isspace(int c); + +/* internal */ + +struct json_source { + int (*get)(struct json_source *); + int (*peek)(struct json_source *); + size_t position; + union { + struct { + FILE *stream; + } stream; + struct { + const char *buffer; + size_t length; + } buffer; + struct { + void *ptr; + json_user_io get; + json_user_io peek; + } user; + } source; +}; + +struct json_stream { + size_t lineno; + + /* While counting lines is straightforward, columns are tricky because we + * have to count codepoints, not bytes. We could have peppered the code + * with increments in all the relevant places but that seems inelegant. + * So instead we calculate the column dynamically, based on the current + * position. + * + * Specifically, we will remember the position at the beginning of each + * line (linepos) and, assuming only the ASCII characters on the line, the + * column will be the difference between the current position and linepos. + * Of course there could also be multi-byte UTF-8 sequences which we will + * handle by keeping an adjustment (lineadj) -- the number of continuation + * bytes encountered on this line so far. Finally, for json_source_get() + * we also have to keep the number of remaining continuation bytes in the + * current multi-byte UTF-8 sequence (linecon). 
+ * + * This is not the end of the story, however: with only the just described + * approach we will always end up with the column of the latest character + * read which is not what we want when returning potentially multi- + * character value events (string, number, etc); in these cases we want to + * return the column of the first character (note that if the value itself + * is invalid and we are returning JSON_ERROR, we still want the current + * column). So to handle this we will cache the start column (colno) for + * such events. + */ + size_t linepos; /* Position at the beginning of the current line. */ + size_t lineadj; /* Adjustment for multi-byte UTF-8 sequences. */ + size_t linecon; /* Number of remaining UTF-8 continuation bytes. */ + size_t colno; /* Start column for value events or 0. */ + + struct json_stack *stack; + size_t stack_top; + size_t stack_size; + enum json_type next; + unsigned flags; + + struct { + char *string; + size_t string_fill; + size_t string_size; + } data; + + size_t ntokens; + + struct json_source source; + struct json_allocator alloc; + char errmsg[128]; +}; + +#ifdef __cplusplus +} /* extern "C" */ +#endif /* __cplusplus */ + +#endif diff --git a/libbutl/json/serializer.cxx b/libbutl/json/serializer.cxx new file mode 100644 index 0000000..fbd569a --- /dev/null +++ b/libbutl/json/serializer.cxx @@ -0,0 +1,671 @@ +#include <cstdio> // snprintf +#include <cstdarg> // va_list +#include <cstring> // memcpy +#include <ostream> + +#include <libbutl/json/serializer.hxx> + +using namespace std; + +namespace butl +{ + namespace json + { + using buffer = buffer_serializer::buffer; + using error_code = invalid_json_output::error_code; + + template <typename T> + static void + dynarray_overflow (void* d, event, buffer& b, size_t ex) + { + T& v (*static_cast<T*> (d)); + v.resize (b.capacity + ex); + v.resize (v.capacity ()); + // const_cast is required for std::string pre C++17. 
+ // + b.data = const_cast<typename T::value_type*> (v.data ()); + b.capacity = v.size (); + } + + template <typename T> + static void + dynarray_flush (void* d, event, buffer& b) + { + T& v (*static_cast<T*> (d)); + v.resize (b.size); + b.data = const_cast<typename T::value_type*> (v.data ()); + b.capacity = b.size; + } + + buffer_serializer:: + buffer_serializer (string& s, size_t i) + : buffer_serializer (const_cast<char*> (s.data ()), size_, s.size (), + dynarray_overflow<string>, + dynarray_flush<string>, + &s, + i) + { + size_ = s.size (); + } + + buffer_serializer:: + buffer_serializer (vector<char>& v, size_t i) + : buffer_serializer (v.data (), size_, v.size (), + dynarray_overflow<vector<char>>, + dynarray_flush<vector<char>>, + &v, + i) + { + size_ = v.size (); + } + + static void + ostream_overflow (void* d, event e, buffer& b, size_t) + { + ostream& s (*static_cast<ostream*> (d)); + s.write (static_cast<char*> (b.data), b.size); + if (s.fail ()) + throw invalid_json_output ( + e, error_code::buffer_overflow, "unable to write JSON output text"); + b.size = 0; + } + + static void + ostream_flush (void* d, event e, buffer& b) + { + ostream_overflow (d, e, b, 0); + + ostream& s (*static_cast<ostream*> (d)); + s.flush (); + if (s.fail ()) + throw invalid_json_output ( + e, error_code::buffer_overflow, "unable to write JSON output text"); + } + + stream_serializer:: + stream_serializer (ostream& os, size_t i) + : buffer_serializer (tmp_, sizeof (tmp_), + ostream_overflow, + ostream_flush, + &os, + i) + { + } + + bool buffer_serializer:: + next (optional<event> e, pair<const char*, size_t> val, bool check) + { + if (absent_ == 2) + goto fail_complete; + + if (e == nullopt) + { + if (!state_.empty ()) + goto fail_incomplete; + + absent_++; + return false; + } + + absent_ = 0; // Clear inter-value absent event. + + { + state* st (state_.empty () ? 
nullptr : &state_.back ()); + + auto name_expected = [] (const state& s) + { + return s.type == event::begin_object && s.count % 2 == 0; + }; + + auto make_str = [] (const char* s, size_t n) + { + return make_pair (s, n); + }; + + // When it comes to pretty-printing, the common way to do it is along + // these lines: + // + // { + // "str": "value", + // "obj": { + // "arr": [ + // 1, + // 2, + // 3 + // ] + // }, + // "num": 123 + // } + // + // Empty objects and arrays are printed without a newline: + // + // { + // "obj": {}, + // "arr": [] + // } + // + // There are two types of separators: between name and value, which is + // always ": ", and before/after value inside an object or array which + // is either newline followed by indentation, or comma followed by + // newline followed by indentation (we also have separation between + // top-level values but that's orthogonal to pretty-printing). + // + // Based on this observation, we are going to handle the latter case by + // starting with the ",\n" string (in this->sep_) and pushing/popping + // indentation spaces as we enter/leave objects and arrays. We handle + // the cases where we don't need the comma by simply skipping it in the + // C-string pointer. + // + bool pp (indent_ != 0); + + pair<const char*, size_t> sep; + if (st != nullptr) + { + // The name-value separator. + // + if (st->type == event::begin_object && st->count % 2 == 1) + { + sep = !pp ? make_str (":", 1) : make_str (": ", 2); + } + // We don't need the comma if we are closing the object or array. + // + else if (e == event::end_array || e == event::end_object) + { + // But in this case we need to unindent one level prior to writing + // the brace. Also handle the empty object/array as a special case. + // + sep = !pp || st->count == 0 + ? make_str (nullptr, 0) + : make_str (sep_.c_str () + 1, sep_.size () - 1 - indent_); + } + // Or if this is the first value (note: must come after end_*). 
+ // + else if (st->count == 0) + { + sep = !pp + ? make_str (nullptr, 0) + : make_str (sep_.c_str () + 1, sep_.size () - 1); + } + else + { + sep = !pp + ? make_str (",", 1) + : make_str (sep_.c_str (), sep_.size ()); + } + } + else if (values_ != 0) // Subsequent top-level value. + { + // Top-level value separation. For now we always separate them with + // newlines, which is the most common/sensible way. + // + sep = make_str ("\n", 1); + } + + switch (*e) + { + case event::begin_array: + case event::begin_object: + { + if (st != nullptr && name_expected (*st)) + goto fail_unexpected_event; + + write (*e, + sep, + make_str (e == event::begin_array ? "[" : "{", 1), + false); + + if (st != nullptr) + st->count++; + + if (pp) + sep_.append (indent_, ' '); + + state_.push_back (state {*e, 0}); + break; + } + case event::end_array: + case event::end_object: + { + if (st == nullptr || (e == event::end_array + ? st->type != event::begin_array + : !name_expected (*st))) + goto fail_unexpected_event; + + write (*e, + sep, + make_str (e == event::end_array ? "]" : "}", 1), + false); + + if (pp) + sep_.erase (sep_.size () - indent_); + + state_.pop_back (); + break; + } + case event::name: + case event::string: + { + if (e == event::name + ? (st == nullptr || !name_expected (*st)) + : (st != nullptr && name_expected (*st))) + goto fail_unexpected_event; + + write (*e, sep, val, check, '"'); + + if (st != nullptr) + st->count++; + break; + } + case event::null: + case event::boolean: + { + if (e == event::null && val.first == nullptr) + val = {"null", 4}; + else if (check) + { + auto eq = [&val] (const char* v, size_t n) + { + return val.second == n && memcmp (val.first, v, n) == 0; + }; + + if (e == event::null) + { + if (!eq ("null", 4)) + goto fail_null; + } + else + { + if (!eq ("true", 4) && !eq ("false", 5)) + goto fail_bool; + } + } + } + // Fall through. + case event::number: + { + // Note: this event is also used by value_json_text(). 
+ + if (st != nullptr && name_expected (*st)) + goto fail_unexpected_event; + + write (*e, sep, val, check); + + if (st != nullptr) + st->count++; + break; + } + } + } + + if (state_.empty ()) + { + values_++; + if (flush_ != nullptr) + flush_ (data_, *e, buf_); + + return false; + } + + return true; + + fail_complete: + throw invalid_json_output ( + e, error_code::invalid_value, "value sequence is complete"); + fail_incomplete: + throw invalid_json_output ( + e, error_code::invalid_value, "value is incomplete"); + fail_null: + throw invalid_json_output ( + e, error_code::invalid_value, "invalid null value"); + fail_bool: + throw invalid_json_output ( + e, error_code::invalid_value, "invalid boolean value"); + fail_unexpected_event: + throw invalid_json_output ( + e, error_code::unexpected_event, "unexpected event"); + } + + // JSON escape sequences for control characters <= 0x1F. + // + static const char* json_escapes[] = + {"\\u0000", "\\u0001", "\\u0002", "\\u0003", "\\u0004", "\\u0005", + "\\u0006", "\\u0007", "\\b", "\\t", "\\n", "\\u000B", + "\\f", "\\r", "\\u000E", "\\u000F", "\\u0010", "\\u0011", + "\\u0012", "\\u0013", "\\u0014", "\\u0015", "\\u0016", "\\u0017", + "\\u0018", "\\u0019", "\\u001A", "\\u001B", "\\u001C", "\\u001D", + "\\u001E", "\\u001F"}; + + void buffer_serializer:: + write (event e, + pair<const char*, size_t> sep, + pair<const char*, size_t> val, + bool check, + char q) + { + // Assumptions: + // + // 1. A call to overflow should be able to provide enough capacity to + // write the entire separator (in other words, we are not going to + // bother with chunking the separator). + // + // 2. Similarly, a call to overflow should be able to provide enough + // capacity to write an entire UTF-8 multi-byte sequence. + // + // 3. Performance-wise, we do not expect very long contiguous sequences + // of character that require escaping. + + // Total number of bytes remaining to be written and the capacity + // currently available. 
+ // + size_t size (sep.second + val.second + (q != '\0' ? 2 : 0)); + size_t cap (buf_.capacity - buf_.size); + + auto grow = [this, e, &size, &cap] (size_t min, size_t extra = 0) + { + if (overflow_ == nullptr) + return false; + + extra += size; + extra -= cap; + overflow_ (data_, e, buf_, extra > min ? extra : min); + cap = buf_.capacity - buf_.size; + + return cap >= min; + }; + + auto append = [this, &cap, &size] (const char* d, size_t s) + { + memcpy (static_cast<char*> (buf_.data) + buf_.size, d, s); + buf_.size += s; + cap -= s; + size -= s; + }; + + // Return the longest chunk of input that fits into the buffer and does + // not end in the middle of a multi-byte UTF-8 sequence. Assume value + // size and capacity are not 0. Return NULL in first if no chunk could + // be found that fits into the remaining space. In this case, second is + // the additional (to size) required space (used to handle escapes in + // the checked version). + // + // The basic idea is to seek in the input buffer to the capacity of the + // output buffer (unless the input is shorter than the output). If we + // ended up in the middle of a multi-byte UTF-8 sequence, then seek back + // until we end up at the UTF-8 sequence boundary. Note that this + // implementation assumes valid UTF-8. + // + auto chunk = [&cap, &val] () -> pair<const char*, size_t> + { + pair<const char*, size_t> r (nullptr, 0); + + if (cap >= val.second) + r = val; + else + { + // Start from the character past capacity and search for a UTF-8 + // sequence boundary. + // + for (const char* p (val.first + cap); p != val.first; --p) + { + const auto u (static_cast<uint8_t> (*p)); + if (u < 0x80 || u > 0xBF) // Not a continuation byte + { + r = {val.first, p - val.first}; + break; + } + } + } + + val.first += r.second; + val.second -= r.second; + + return r; + }; + + // Escaping and UTF-8-validating version of chunk(). 
+ // + // There are three classes of mandatory escapes in a JSON string: + // + // - \\ and \" + // + // - \b \f \n \r \t for popular control characters + // + // - \u00NN for other control characters <= 0x1F + // + // If the input begins with a character that must be escaped, return + // only its escape sequence. Otherwise validate and return everything up + // to the end of input or buffer capacity, but cutting it short before + // the next character that must be escaped or the first UTF-8 sequence + // that would not fit. + // + // Return string::npos in second in case of a stray continuation byte or + // any byte in an invalid UTF-8 range (for example, an "overlong" 2-byte + // encoding of a 7-bit/ASCII character or a 4-, 5-, or 6-byte sequence + // that would encode a codepoint beyond the U+10FFFF Unicode limit). + // + auto chunk_checked = [&cap, &size, &val] () -> pair<const char*, size_t> + { + pair<const char*, size_t> r (nullptr, 0); + + // Check whether the first character needs to be escaped. + // + const uint8_t c (val.first[0]); + if (c == '"') + r = {"\\\"", 2}; + else if (c == '\\') + r = {"\\\\", 2}; + else if (c <= 0x1F) + { + auto s (json_escapes[c]); + r = {s, s[1] == 'u' ? 6 : 2}; + } + + if (r.first != nullptr) + { + // Return in second the additional (to size) space required. + // + if (r.second > cap) + return {nullptr, r.second - 1}; + + // If we had to escape the character then adjust size accordingly + // (see append() above). + // + size += r.second - 1; + + val.first += 1; + val.second -= 1; + return r; + } + + // First character doesn't need to be escaped. Return as much of the + // rest of the input as possible. + // + size_t i (0); + for (size_t n (min (cap, val.second)); i != n; i++) + { + const uint8_t c1 (val.first[i]); + + if (c1 == '"' || c1 == '\\' || c1 <= 0x1F) // Needs to be escaped. + break; + else if (c1 >= 0x80) // Not ASCII, so validate as a UTF-8 sequence. + { + size_t i1 (i); // Position of the first byte. 
+ + // The control flow here is to continue if valid and to fall + // through to return on error. + // + if (c1 >= 0xC2 && c1 <= 0xDF) // 2-byte sequence. + { + if (i + 2 <= val.second) // Sequence is complete in JSON value. + { + if (i + 2 > cap) // Sequence won't fit. + break; + + const uint8_t c2 (val.first[++i]); + + if (c2 >= 0x80 && c2 <= 0xBF) + continue; + } + } + else if (c1 >= 0xE0 && c1 <= 0xEF) // 3-byte sequence. + { + if (i + 3 <= val.second) + { + if (i + 3 > cap) + break; + + const uint8_t c2 (val.first[++i]), c3 (val.first[++i]); + + if (c3 >= 0x80 && c3 <= 0xBF) + { + switch (c1) + { + case 0xE0: if (c2 >= 0xA0 && c2 <= 0xBF) continue; break; + case 0xED: if (c2 >= 0x80 && c2 <= 0x9F) continue; break; + default: if (c2 >= 0x80 && c2 <= 0xBF) continue; break; + } + } + } + } + else if (c1 >= 0xF0 && c1 <= 0xF4) // 4-byte sequence. + { + if (i + 4 <= val.second) + { + if (i + 4 > cap) + break; + + const uint8_t c2 (val.first[++i]), + c3 (val.first[++i]), + c4 (val.first[++i]); + + if (c3 >= 0x80 && c3 <= 0xBF && + c4 >= 0x80 && c4 <= 0xBF) + { + switch (c1) + { + case 0xF0: if (c2 >= 0x90 && c2 <= 0xBF) continue; break; + case 0xF4: if (c2 >= 0x80 && c2 <= 0x8F) continue; break; + default: if (c2 >= 0x80 && c2 <= 0xBF) continue; break; + } + } + } + } + + r = {val.first, string::npos}; + + // Update val to point to the beginning of the invalid sequence. + // + val.first += i1; + val.second -= i1; + + return r; + } + } + + if (i != 0) // We have a chunk. + { + r = {val.first, i}; + + val.first += i; + val.second -= i; + } + + return r; + }; + + // Value's original size (used to calculate the offset of the errant + // character in case of a validation failure). + // + const size_t vn (val.second); + + // Write the separator, if any. + // + if (sep.second != 0) + { + if (cap < sep.second && !grow (sep.second)) + goto fail_nospace; + + append (sep.first, sep.second); + } + + // Write the value's opening quote, if requested. 
+ // + if (q != '\0') + { + if (cap == 0 && !grow (1)) + goto fail_nospace; + + append ("\"", 1); + } + + // Write the value, unless empty. + // + while (val.second != 0) + { + pair<const char*, size_t> ch (nullptr, 0); + + if (cap != 0) + ch = check ? chunk_checked () : chunk (); + + if (ch.first == nullptr) + { + // The minimum extra bytes we need the overflow function to be able + // to provide is based on these sequences that we do not break: + // + // - 4 bytes for a UTF-8 sequence + // - 6 bytes for an escaped Unicode sequence (\uXXXX). + // + if (!grow (6, ch.second)) + goto fail_nospace; + } + else if (ch.second != string::npos) + append (ch.first, ch.second); + else + goto fail_utf8; + } + + // Write the value's closing quote, if requested. + // + if (q != '\0') + { + if (cap == 0 && !grow (1)) + goto fail_nospace; + + append ("\"", 1); + } + + return; + + // Note: keep descriptions consistent with the parser. + // + fail_utf8: + throw invalid_json_output (e, + e == event::name ? error_code::invalid_name + : error_code::invalid_value, + "invalid UTF-8 text", + vn - val.second); + + fail_nospace: + throw invalid_json_output ( + e, error_code::buffer_overflow, "insufficient space in buffer"); + } + + size_t buffer_serializer:: + to_chars_impl (char* b, size_t n, const char* f, ...) 
+ { + va_list a; + va_start (a, f); + const int r (vsnprintf (b, n, f, a)); + va_end (a); + + if (r < 0 || r >= static_cast<int> (n)) + { + throw invalid_json_output (event::number, + error_code::invalid_value, + "unable to convert number to string"); + } + + return static_cast<size_t> (r); + } + } +} diff --git a/libbutl/json/serializer.hxx b/libbutl/json/serializer.hxx new file mode 100644 index 0000000..5192cb4 --- /dev/null +++ b/libbutl/json/serializer.hxx @@ -0,0 +1,413 @@ +#pragma once + +#ifdef BUILD2_BOOTSTRAP +# error JSON serializer not available during bootstrap +#endif + +#include <array> +#include <iosfwd> +#include <string> +#include <vector> +#include <cstddef> // size_t, nullptr_t +#include <utility> // pair +#include <stdexcept> // invalid_argument +#include <type_traits> // enable_if, is_* + +#include <libbutl/optional.hxx> // butl::optional is std::optional or similar. + +#include <libbutl/json/event.hxx> + +#include <libbutl/export.hxx> + +namespace butl +{ + // Using the RFC8259 terminology: JSON (output) text, JSON value, object + // member. + // + namespace json + { + class invalid_json_output: public std::invalid_argument + { + public: + using event_type = json::event; + + enum class error_code + { + buffer_overflow, + unexpected_event, + invalid_name, + invalid_value + }; + + invalid_json_output (optional<event_type> event, + error_code code, + const char* description, + std::size_t offset = std::string::npos); + + invalid_json_output (optional<event_type> event, + error_code code, + const std::string& description, + std::size_t offset = std::string::npos); + + // Event that triggered the error. If the error is in the value, then + // offset points to the offending byte (for example, the beginning of an + // invalid UTF-8 byte sequence). Otherwise, offset is string::npos. 
+ // + optional<event_type> event; + error_code code; + std::size_t offset; + }; + + // The serializer makes sure the resulting JSON is syntactically but not + // necessarily semantically correct. For example, it's possible to + // serialize a number event with non-numeric data. + // + // Note that unlike the parser, the serializer is always in the multi- + // value mode allowing the serialization of zero or more values. Note also + // that while values are separated with newlines, there is no trailing + // newline after the last (or only) value and the user is expected to add + // it manually if needed. + // + // Also note that while RFC8259 recommends object members to have unique + // names, the serializer does not enforce this. + // + class LIBBUTL_SYMEXPORT buffer_serializer + { + public: + // Serialize to string growing it as necessary. + // + // The indentation argument specifies the number of indentation spaces + // that should be used for pretty-printing. If 0 is passed, no + // pretty-printing is performed. + // + explicit + buffer_serializer (std::string&, std::size_t indentation = 2); + + // Serialize to vector of characters growing it as necessary. + // + explicit + buffer_serializer (std::vector<char>&, std::size_t indentation = 2); + + // Serialize to a fixed array. + // + // The length of the output text written is tracked in the size + // argument. + // + // If the array is not big enough to store the entire output text, the + // next() call that reaches the limit will throw invalid_json_output. + // + template <std::size_t N> + buffer_serializer (std::array<char, N>&, std::size_t& size, + std::size_t indentation = 2); + + // Serialize to a fixed buffer. + // + // The length of the output text written is tracked in the size + // argument. + // + // If the buffer is not big enough to store the entire output text, the + // next() call that reaches the limit will throw invalid_json_output. 
+ // + buffer_serializer (void* buf, std::size_t& size, std::size_t capacity, + std::size_t indentation = 2); + + // The overflow function is called when the output buffer is out of + // space. The extra argument is a hint indicating the extra space likely + // to be required. + // + // Possible strategies include re-allocating a larger buffer or flushing + // the contents of the original buffer to the output destination. In + // case of a reallocation, the implementation is responsible for copying + // the contents of the original buffer over. + // + // The flush function is called when the complete JSON value has been + // serialized to the buffer. It can be used to write the contents of the + // buffer to the output destination. Note that flush is not called after + // the second absent (nullopt) event (or the only absent event; see + // next() for details). + // + // Both functions are passed the original buffer, its size (the amount + // of output text), and its capacity. They return (by modifying the + // argument) the replacement buffer and its size and capacity (these may + // refer to the original buffer). If space cannot be made available, the + // implementation can throw an appropriate exception (for example, + // std::bad_alloc or std::ios_base::failure). Any exceptions thrown is + // propagated to the user. + // + struct buffer + { + void* data; + std::size_t& size; + std::size_t capacity; + }; + + using overflow_function = void (void* data, + event, + buffer&, + std::size_t extra); + using flush_function = void (void* data, event, buffer&); + + // Serialize using a custom buffer and overflow/flush functions (both + // are optional). + // + buffer_serializer (void* buf, std::size_t capacity, + overflow_function*, + flush_function*, + void* data, + std::size_t indentation = 2); + + // As above but the length of the output text written is tracked in the + // size argument. 
+ // + buffer_serializer (void* buf, std::size_t& size, std::size_t capacity, + overflow_function*, + flush_function*, + void* data, + std::size_t indentation = 2); + + // Begin/end an object. + // + // The member_begin_object() version is a shortcut for: + // + // member_name (name, check); + // begin_object (); + // + void + begin_object (); + + void + member_begin_object (const char*, bool check = true); + + void + member_begin_object (const std::string&, bool check = true); + + void + end_object (); + + // Serialize an object member (name and value). + // + // If check is false, then don't check whether the name (or value, if + // it's a string) is valid UTF-8 and don't escape any characters. + // + template <typename T> + void + member (const char* name, const T& value, bool check = true); + + template <typename T> + void + member (const std::string& name, const T& value, bool check = true); + + // Serialize an object member name. + // + // If check is false, then don't check whether the name is valid UTF-8 + // and don't escape any characters. + // + void + member_name (const char*, bool check = true); + + void + member_name (const std::string&, bool check = true); + + // Begin/end an array. + // + // The member_begin_array() version is a shortcut for: + // + // member_name (name, check); + // begin_array (); + // + void + begin_array (); + + void + member_begin_array (const char*, bool check = true); + + void + member_begin_array (const std::string&, bool check = true); + + void + end_array (); + + // Serialize a string. + // + // If check is false, then don't check whether the value is valid UTF-8 + // and don't escape any characters. + // + // Note that a NULL C-string pointer is serialized as a null value. + // + void + value (const char*, bool check = true); + + void + value (const std::string&, bool check = true); + + // Serialize a number. 
+ // + template <typename T> + typename std::enable_if<std::is_integral<T>::value || + std::is_floating_point<T>::value>::type + value (T); + + // Serialize a boolean value. + // + void + value (bool); + + // Serialize a null value. + // + void + value (std::nullptr_t); + + // Serialize value as a pre-serialized JSON value. + // + // Note that the value is expected to be a valid (and suitable) UTF-8- + // encoded JSON text. Note also that if pretty-printing is enabled, + // the resulting output may not be correctly indented. + // + void + value_json_text (const char*); + + void + value_json_text (const std::string&); + + // Serialize next JSON event. + // + // If check is false, then don't check whether the value is valid UTF-8 + // and don't escape any characters. + // + // Return true if more events are required to complete the (top-level) + // value (that is, it is currently incomplete) and false otherwise. + // Throw invalid_json_output exception in case of an invalid event or + // value. + // + // At the end of the value an optional absent (nullopt) event can be + // serialized to verify the value is complete. If it is incomplete an + // invalid_json_output exception is thrown. An optional followup absent + // event can be serialized to indicate the completion of a multi-value + // sequence (one and only absent event indicates a zero value sequence). + // If anything is serialized to a complete value sequence an + // invalid_json_output exception is thrown. + // + // Note that this function was designed to be easily invoked with the + // output from parser::next() and parser::data(). 
For example, for a + // single-value mode: + // + // optional<event> e; + // do + // { + // e = p.next (); + // s.next (e, p.data ()); + // } + // while (e); + // + // For a multi-value mode: + // + // while (p.peek ()) + // { + // optional<event> e; + // do + // { + // e = p.next (); + // s.next (e, p.data ()); + // } + // while (e); + // } + // s.next (nullopt); // End of value sequence. + // + bool + next (optional<event> event, + std::pair<const char*, std::size_t> value = {}, + bool check = true); + + private: + void + write (event, + std::pair<const char*, std::size_t> sep, + std::pair<const char*, std::size_t> val, + bool check, char quote = '\0'); + + // Forward a value(v, check) call to value(v) ignoring the check + // argument. Used in the member() implementation. + // + template <typename T> + void + value (const T& v, bool /*check*/) + { + value (v); + } + + // Convert numbers to string. + // + static std::size_t to_chars (char*, std::size_t, int); + static std::size_t to_chars (char*, std::size_t, long); + static std::size_t to_chars (char*, std::size_t, long long); + static std::size_t to_chars (char*, std::size_t, unsigned int); + static std::size_t to_chars (char*, std::size_t, unsigned long); + static std::size_t to_chars (char*, std::size_t, unsigned long long); + static std::size_t to_chars (char*, std::size_t, double); + static std::size_t to_chars (char*, std::size_t, long double); + + static std::size_t to_chars_impl (char*, size_t, const char* fmt, ...); + + buffer buf_; + std::size_t size_; + overflow_function* overflow_; + flush_function* flush_; + void* data_; + + // State of a "structured type" (array or object; as per the RFC + // terminology). + // + struct state + { + const event type; // Type kind (begin_array or begin_object). + std::size_t count; // Number of events serialized inside this type. + }; + + // Stack of nested structured type states. + // + // @@ TODO: would have been nice to use small_vector. 
+ // + std::vector<state> state_; + + // The number of consecutive absent events (nullopt) serialized thus + // far. + // + // Note: initialized to 1 to naturally handle a single absent event + // (declares an empty value sequence complete). + // + std::size_t absent_ = 1; + + // The number of spaces with which to indent (once for each level of + // nesting). If zero, pretty-printing is disabled. + // + std::size_t indent_; + + // Separator and indentation before/after value inside an object or + // array (see pretty-printing implementation for details). + // + std::string sep_; + + // The number of complete top-level values serialized thus far. + // + std::size_t values_ = 0; + }; + + class LIBBUTL_SYMEXPORT stream_serializer: public buffer_serializer + { + public: + // Serialize to std::ostream. + // + // If stream exceptions are enabled then the std::ios_base::failure + // exception is used to report input/output errors (badbit and failbit). + // Otherwise, those are reported as the invalid_json_output exception. 
+ // + explicit + stream_serializer (std::ostream&, std::size_t indentation = 2); + + protected: + char tmp_[4096]; + }; + } +} + +#include <libbutl/json/serializer.ixx> diff --git a/libbutl/json/serializer.ixx b/libbutl/json/serializer.ixx new file mode 100644 index 0000000..a719ef6 --- /dev/null +++ b/libbutl/json/serializer.ixx @@ -0,0 +1,247 @@ +#include <cstring> // strlen() + +namespace butl +{ + namespace json + { + inline invalid_json_output:: + invalid_json_output (optional<event_type> e, + error_code c, + const char* d, + std::size_t o) + : std::invalid_argument (d), event (e), code (c), offset (o) + { + } + + inline invalid_json_output:: + invalid_json_output (optional<event_type> e, + error_code c, + const std::string& d, + std::size_t o) + : invalid_json_output (e, c, d.c_str (), o) + { + } + + inline buffer_serializer:: + buffer_serializer (void* b, std::size_t& s, std::size_t c, + overflow_function* o, flush_function* f, void* d, + std::size_t i) + : buf_ {b, s, c}, + overflow_ (o), + flush_ (f), + data_ (d), + indent_ (i), + sep_ (indent_ != 0 ? 
",\n" : "") + { + } + + template <std::size_t N> + inline buffer_serializer:: + buffer_serializer (std::array<char, N>& a, std::size_t& s, std::size_t i) + : buffer_serializer (a.data (), s, a.size (), + nullptr, nullptr, nullptr, + i) + { + } + + inline buffer_serializer:: + buffer_serializer (void* b, std::size_t& s, std::size_t c, std::size_t i) + : buffer_serializer (b, s, c, nullptr, nullptr, nullptr, i) + { + } + + inline buffer_serializer:: + buffer_serializer (void* b, std::size_t c, + overflow_function* o, flush_function* f, void* d, + std::size_t i) + : buffer_serializer (b, size_, c, o, f, d, i) + { + size_ = 0; + } + + inline void buffer_serializer:: + begin_object () + { + next (event::begin_object); + } + + inline void buffer_serializer:: + end_object () + { + next (event::end_object); + } + + inline void buffer_serializer:: + member_name (const char* n, bool c) + { + next (event::name, {n, n != nullptr ? std::strlen (n) : 0}, c); + } + + inline void buffer_serializer:: + member_name (const std::string& n, bool c) + { + next (event::name, {n.c_str (), n.size ()}, c); + } + + inline void buffer_serializer:: + member_begin_object (const char* n, bool c) + { + member_name (n, c); + begin_object (); + } + + inline void buffer_serializer:: + member_begin_object (const std::string& n, bool c) + { + member_name (n, c); + begin_object (); + } + + template <typename T> + inline void buffer_serializer:: + member (const char* n, const T& v, bool c) + { + member_name (n, c); + value (v, c); + } + + template <typename T> + inline void buffer_serializer:: + member (const std::string& n, const T& v, bool c) + { + member_name (n, c); + value (v, c); + } + + inline void buffer_serializer:: + begin_array () + { + next (event::begin_array); + } + + inline void buffer_serializer:: + member_begin_array (const char* n, bool c) + { + member_name (n, c); + begin_array (); + } + + inline void buffer_serializer:: + member_begin_array (const std::string& n, bool c) + { + 
member_name (n, c); + begin_array (); + } + + inline void buffer_serializer:: + end_array () + { + next (event::end_array); + } + + inline void buffer_serializer:: + value (const char* v, bool c) + { + if (v != nullptr) + next (event::string, {v, std::strlen (v)}, c); + else + next (event::null); + } + + inline void buffer_serializer:: + value (const std::string& v, bool c) + { + next (event::string, {v.c_str (), v.size ()}, c); + } + + template <typename T> + typename std::enable_if<std::is_integral<T>::value || + std::is_floating_point<T>::value>::type + buffer_serializer:: + value (T v) + { + // The largest 128-bit integer has 39 digits, and long floating point + // numbers will fit because they are output in scientific notation. + // + char b[40]; + const std::size_t n (to_chars (b, sizeof (b), v)); + next (event::number, {b, n}); + } + + inline void buffer_serializer:: + value (bool b) + { + next (event::boolean, + b ? std::make_pair ("true", 4) : std::make_pair ("false", 5)); + } + + inline void buffer_serializer:: + value (std::nullptr_t) + { + next (event::null); + } + + inline void buffer_serializer:: + value_json_text (const char* v) + { + // Use event::number (which doesn't involve any quoting) with a disabled + // check. 
+ // + next (event::number, {v, std::strlen (v)}, false /* check */); + } + + inline void buffer_serializer:: + value_json_text (const std::string& v) + { + next (event::number, {v.c_str (), v.size ()}, false /* check */); + } + + inline size_t buffer_serializer:: + to_chars (char* b, size_t s, int v) + { + return to_chars_impl (b, s, "%d", v); + } + + inline size_t buffer_serializer:: + to_chars (char* b, size_t s, long v) + { + return to_chars_impl (b, s, "%ld", v); + } + + inline size_t buffer_serializer:: + to_chars (char* b, size_t s, long long v) + { + return to_chars_impl (b, s, "%lld", v); + } + + inline size_t buffer_serializer:: + to_chars (char* b, size_t s, unsigned v) + { + return to_chars_impl (b, s, "%u", v); + } + + inline size_t buffer_serializer:: + to_chars (char* b, size_t s, unsigned long v) + { + return to_chars_impl (b, s, "%lu", v); + } + + inline size_t buffer_serializer:: + to_chars (char* b, size_t s, unsigned long long v) + { + return to_chars_impl (b, s, "%llu", v); + } + + inline size_t buffer_serializer:: + to_chars (char* b, size_t s, double v) + { + return to_chars_impl (b, s, "%.10g", v); + } + + inline size_t buffer_serializer:: + to_chars (char* b, size_t s, long double v) + { + return to_chars_impl (b, s, "%.10Lg", v); + } + } +} |