aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBoris Kolpackov <boris@codesynthesis.com>2022-09-30 13:31:25 +0200
committerBoris Kolpackov <boris@codesynthesis.com>2022-09-30 13:31:25 +0200
commitd9dd84487bda8303590d5b30987f1d76b93867ba (patch)
tree7fd0a926dea644a2ddb70c863574af386b9d7f79
parente53da6ae4665ce49dddcc9aaa97a4e87bb94f48d (diff)
Add JSON parser (copy of libstud-json)
-rw-r--r--LICENSE4
-rw-r--r--libbutl/buildfile4
-rw-r--r--libbutl/json/parser.cxx502
-rw-r--r--libbutl/json/parser.hxx408
-rw-r--r--libbutl/json/parser.ixx216
-rw-r--r--libbutl/json/pdjson.c1020
-rw-r--r--libbutl/json/pdjson.h147
7 files changed, 2299 insertions, 2 deletions
diff --git a/LICENSE b/LICENSE
index de2b5a9..f22e52f 100644
--- a/LICENSE
+++ b/LICENSE
@@ -8,6 +8,10 @@ libbutl/mingw-*.hxx:
2-clause BSD License; see the file headers for details.
+libbutl/json/pdjson.[hc]:
+
+UNLICENSE (dedicated to the public domain).
+
The rest:
MIT License
diff --git a/libbutl/buildfile b/libbutl/buildfile
index 6c490af..ba4ad96 100644
--- a/libbutl/buildfile
+++ b/libbutl/buildfile
@@ -29,9 +29,9 @@ lib{butl}: {hxx ixx cxx}{win32-utility}: include = $windows
lib{butl}: hxx{mingw-*}: include = $mingw_stdthread
# Our C-files are always included into C++-files that wrap the corresponding
-# API so treat them as files exclude from the compilation.
+# API so treat them as files to exclude from the compilation.
#
-lib{butl}: file{*.c *.h}
+lib{butl}: file{**.c **.h}
# Platform-specific UUID implementations.
#
diff --git a/libbutl/json/parser.cxx b/libbutl/json/parser.cxx
new file mode 100644
index 0000000..fa8916b
--- /dev/null
+++ b/libbutl/json/parser.cxx
@@ -0,0 +1,502 @@
+#define PDJSON_SYMEXPORT static // See below.
+
+#include <libbutl/json/parser.hxx>
+
+#include <istream>
+
+// There is an issue (segfault) with using std::current_exception() and
+// std::rethrow_exception() with older versions of libc++ on Linux. While the
+// exact root cause hasn't been determined, the suspicion is that something
+// gets messed up if we "smuggle" std::exception_ptr through extern "C" call
+// frames (we cannot even destroy such an exception without a segfault). We
+// also could not determine in which version exactly this has been fixed but
+// we know that libc++ 6.0.0 doesn't appear to have this issue (though we are
+// not entirely sure the issue is (only) in libc++; libgcc_s could also be
+// involved).
+//
+// The workaround is to just catch (and note) the exception and then throw a
+// new instance of generic std::istream::failure. In order not to drag the
+// below test into the header, we wrap exception_ptr with optional<> and use
+// NULL to indicate the presence of the exception when the workaround is
+// required.
+//
+// Note that if/when we drop this workaround, we should also get rid of
+// optional<> in stream::exception member.
+//
+#undef LIBBUTL_JSON_NO_EXCEPTION_PTR
+
+#if defined (__linux__) && defined(__clang__)
+# if __has_include(<__config>)
+# include <__config> // _LIBCPP_VERSION
+# if _LIBCPP_VERSION < 6000
+# define LIBBUTL_JSON_NO_EXCEPTION_PTR 1
+# endif
+# endif
+#endif
+
+namespace butl
+{
+ namespace json
+ {
+ using namespace std;
+
+ parser::
+ ~parser ()
+ {
+ json_close (impl_);
+ }
+
+ static int
+ stream_get (void* x)
+ {
+ auto& s (*static_cast<parser::stream*> (x));
+
+ // In the multi-value mode reading of whitespaces/separators is split
+ // between our code and pdjson's. As a result, these functions may end
+ // up being called more than once after EOF is reached. Which is
+ // something iostream does not handle gracefully.
+ //
+ if (!s.is->eof ())
+ {
+ try
+ {
+ // We first peek not to trip failbit on EOF.
+ //
+ if (s.is->peek () != istream::traits_type::eof ())
+ return static_cast<char> (s.is->get ());
+ }
+ catch (...)
+ {
+#ifndef LIBBUTL_JSON_NO_EXCEPTION_PTR
+ s.exception = current_exception ();
+#else
+ s.exception = nullptr;
+#endif
+ }
+ }
+
+ return EOF;
+ }
+
+ static int
+ stream_peek (void* x)
+ {
+ auto& s (*static_cast<parser::stream*> (x));
+
+ if (!s.is->eof ())
+ {
+ try
+ {
+ auto c (s.is->peek ());
+ if (c != istream::traits_type::eof ())
+ return static_cast<char> (c);
+ }
+ catch (...)
+ {
+#ifndef LIBBUTL_JSON_NO_EXCEPTION_PTR
+ s.exception = current_exception ();
+#else
+ s.exception = nullptr;
+#endif
+ }
+ }
+
+ return EOF;
+ }
+
+ // NOTE: watch out for exception safety (specifically, doing anything that
+ // might throw after opening the stream).
+ //
+ parser::
+ parser (istream& is, const char* n, bool mv, const char* sep) noexcept
+ : input_name (n),
+ stream_ {&is, nullopt},
+ multi_value_ (mv),
+ separators_ (sep),
+ raw_s_ (nullptr),
+ raw_n_ (0)
+ {
+ json_open_user (impl_, &stream_get, &stream_peek, &stream_);
+ json_set_streaming (impl_, multi_value_);
+ }
+
+ parser::
+ parser (const void* t,
+ size_t s,
+ const char* n,
+ bool mv,
+ const char* sep) noexcept
+ : input_name (n),
+ stream_ {nullptr, nullopt},
+ multi_value_ (mv),
+ separators_ (sep),
+ raw_s_ (nullptr),
+ raw_n_ (0)
+ {
+ json_open_buffer (impl_, t, s);
+ json_set_streaming (impl_, multi_value_);
+ }
+
+ optional<event> parser::
+ next ()
+ {
+ name_p_ = value_p_ = location_p_ = false;
+
+ // Note that for now we don't worry about the state of the parser if
+ // next_impl() throws assuming it is not going to be reused.
+ //
+ if (peeked_)
+ {
+ parsed_ = peeked_;
+ peeked_ = nullopt;
+ }
+ else
+ parsed_ = next_impl ();
+
+ return translate (*parsed_);
+ }
+
+ optional<event> parser::
+ peek ()
+ {
+ if (!peeked_)
+ {
+ if (parsed_)
+ {
+ cache_parsed_data ();
+ cache_parsed_location ();
+ }
+ peeked_ = next_impl ();
+ }
+ return translate (*peeked_);
+ }
+
+ std::uint64_t parser::
+ line () const noexcept
+ {
+ if (!location_p_)
+ {
+ if (!parsed_)
+ return 0;
+
+ assert (!peeked_);
+
+ return static_cast<uint64_t> (
+ json_get_lineno (const_cast<json_stream*> (impl_)));
+ }
+
+ return line_;
+ }
+
+ std::uint64_t parser::
+ column () const noexcept
+ {
+ if (!location_p_)
+ {
+ if (!parsed_)
+ return 0;
+
+ assert (!peeked_);
+
+ return static_cast<uint64_t> (
+ json_get_column (const_cast<json_stream*> (impl_)));
+ }
+
+ return column_;
+ }
+
+ std::uint64_t parser::
+ position () const noexcept
+ {
+ if (!location_p_)
+ {
+ if (!parsed_)
+ return 0;
+
+ assert (!peeked_);
+
+ return static_cast<uint64_t> (
+ json_get_position (const_cast<json_stream*> (impl_)));
+ }
+
+ return position_;
+ }
+
+ json_type parser::
+ next_impl ()
+ {
+ raw_s_ = nullptr;
+ raw_n_ = 0;
+ json_type e;
+
+ // Read characters between values skipping required separators and JSON
+ // whitespaces. Return whether a required separator was encountered as
+ // well as the first non-separator/whitespace character (which, if EOF,
+ // should trigger a check for input/output errors).
+ //
+ // Note that the returned non-separator will not have been extracted
+ // from the input (so position, column, etc. will still refer to its
+ // predecessor).
+ //
+ auto skip_separators = [this] () -> pair<bool, int>
+ {
+ bool r (separators_ == nullptr);
+
+ int c;
+ for (; (c = json_source_peek (impl_)) != EOF; json_source_get (impl_))
+ {
+ // User separator.
+ //
+ if (separators_ != nullptr && *separators_ != '\0')
+ {
+ if (strchr (separators_, c) != nullptr)
+ {
+ r = true;
+ continue;
+ }
+ }
+
+ // JSON separator.
+ //
+ if (json_isspace (c))
+ {
+ if (separators_ != nullptr && *separators_ == '\0')
+ r = true;
+
+ continue;
+ }
+
+ break;
+ }
+
+ return make_pair (r, c);
+ };
+
+ // In the multi-value mode skip any instances of required separators
+ // (and any other JSON whitespace) preceding the first JSON value.
+ //
+ if (multi_value_ && !parsed_ && !peeked_)
+ {
+ if (skip_separators ().second == EOF && stream_.is != nullptr)
+ {
+ if (stream_.exception) goto fail_rethrow;
+ if (stream_.is->fail ()) goto fail_stream;
+ }
+ }
+
+ e = json_next (impl_);
+
+ // First check for a pending input/output error.
+ //
+ if (stream_.is != nullptr)
+ {
+ if (stream_.exception) goto fail_rethrow;
+ if (stream_.is->fail ()) goto fail_stream;
+ }
+
+ // There are two ways to view separation between two values: as following
+ // the first value or as preceding the second value. And one aspect that
+ // is determined by this is whether a separation violation is a problem
+ // with the first value or with the second, which becomes important if
+ // the user bails out before parsing the second value.
+ //
+ // Consider these two unseparated value (yes, in JSON they are two
+ // values, leading zeros are not allowed in JSON numbers):
+ //
+ // 01
+ //
+ // If the user bails out after parsing 0 in a stream that should have
+ // been newline-delimited, they most likely would want to get an error
+ // since this is most definitely an invalid value rather than two
+ // values that are not properly separated. So in this light we handle
+ // separators at the end of the first value.
+ //
+ switch (e)
+ {
+ case JSON_DONE:
+ {
+ // Deal with the following value separators.
+ //
+ // Note that we must not do this for the second JSON_DONE (or the
+ // first one in case there are no values) that signals the end of
+ // input.
+ //
+ if (multi_value_ &&
+ (parsed_ || peeked_) &&
+ (peeked_ ? *peeked_ : *parsed_) != JSON_DONE)
+ {
+ auto p (skip_separators ());
+
+ if (p.second == EOF && stream_.is != nullptr)
+ {
+ if (stream_.exception) goto fail_rethrow;
+ if (stream_.is->fail ()) goto fail_stream;
+ }
+
+ // Note that we don't require separators after the last value.
+ //
+ if (!p.first && p.second != EOF)
+ {
+ json_source_get (impl_); // Consume to update column number.
+ goto fail_separation;
+ }
+
+ json_reset (impl_);
+ }
+ break;
+ }
+ case JSON_ERROR: goto fail_json;
+ case JSON_STRING:
+ case JSON_NUMBER:
+ raw_s_ = json_get_string (impl_, &raw_n_);
+ raw_n_--; // Includes terminating `\0`.
+ break;
+ case JSON_TRUE: raw_s_ = "true"; raw_n_ = 4; break;
+ case JSON_FALSE: raw_s_ = "false"; raw_n_ = 5; break;
+ case JSON_NULL: raw_s_ = "null"; raw_n_ = 4; break;
+ default: break;
+ }
+
+ return e;
+
+ fail_json:
+ throw invalid_json_input (
+ input_name != nullptr ? input_name : "",
+ static_cast<uint64_t> (json_get_lineno (impl_)),
+ static_cast<uint64_t> (json_get_column (impl_)),
+ static_cast<uint64_t> (json_get_position (impl_)),
+ json_get_error (impl_));
+
+ fail_separation:
+ throw invalid_json_input (
+ input_name != nullptr ? input_name : "",
+ static_cast<uint64_t> (json_get_lineno (impl_)),
+ static_cast<uint64_t> (json_get_column (impl_)),
+ static_cast<uint64_t> (json_get_position (impl_)),
+ "missing separator between JSON values");
+
+ fail_stream:
+ throw invalid_json_input (
+ input_name != nullptr ? input_name : "",
+ static_cast<uint64_t> (json_get_lineno (impl_)),
+ static_cast<uint64_t> (json_get_column (impl_)),
+ static_cast<uint64_t> (json_get_position (impl_)),
+ "unable to read JSON input text");
+
+ fail_rethrow:
+#ifndef LIBBUTL_JSON_NO_EXCEPTION_PTR
+ rethrow_exception (move (*stream_.exception));
+#else
+ throw istream::failure ("unable to read");
+#endif
+ }
+
+ optional<event> parser::
+ translate (json_type e) const noexcept
+ {
+ switch (e)
+ {
+ case JSON_DONE: return nullopt;
+ case JSON_OBJECT: return event::begin_object;
+ case JSON_OBJECT_END: return event::end_object;
+ case JSON_ARRAY: return event::begin_array;
+ case JSON_ARRAY_END: return event::end_array;
+ case JSON_STRING:
+ {
+ // This can be a value or, inside an object, a name from the
+ // name/value pair.
+ //
+ size_t n;
+ return json_get_context (const_cast<json_stream*> (impl_), &n) ==
+ JSON_OBJECT &&
+ n % 2 == 1
+ ? event::name
+ : event::string;
+ }
+ case JSON_NUMBER: return event::number;
+ case JSON_TRUE: return event::boolean;
+ case JSON_FALSE: return event::boolean;
+ case JSON_NULL: return event::null;
+ case JSON_ERROR: assert (false); // Should've been handled by caller.
+ }
+
+ return nullopt; // Should never reach.
+ }
+
+ void parser::
+ cache_parsed_data ()
+ {
+ name_p_ = value_p_ = false;
+ if (const optional<event> e = translate (*parsed_))
+ {
+ if (e == event::name)
+ {
+ name_.assign (raw_s_, raw_n_);
+ name_p_ = true;
+ }
+ else if (value_event (e))
+ {
+ value_.assign (raw_s_, raw_n_);
+ value_p_ = true;
+ }
+ }
+ }
+
+ void parser::
+ cache_parsed_location () noexcept
+ {
+ line_ = static_cast<uint64_t> (json_get_lineno (impl_));
+ column_ = static_cast<uint64_t> (json_get_column (impl_));
+ position_ = static_cast<uint64_t> (json_get_position (impl_));
+ location_p_ = true;
+ }
+
+ bool parser::
+ value_event (optional<event> e) noexcept
+ {
+ if (!e)
+ return false;
+
+ switch (*e)
+ {
+ case event::string:
+ case event::number:
+ case event::boolean:
+ case event::null:
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ [[noreturn]] void parser::
+ throw_invalid_value (const char* type, const char* v, size_t n) const
+ {
+ string d (string ("invalid ") + type + " value: '");
+ d.append (v, n);
+ d += '\'';
+
+ throw invalid_json_input (input_name != nullptr ? input_name : "",
+ line (),
+ column (),
+ position (),
+ move (d));
+ }
+ } // namespace json
+} // namespace butl
+
+// Include the implementation into our translation unit (instead of compiling
+// it separately) to (hopefully) get function inlining without LTO.
+//
+// Let's keep it last since the implementation defines a couple of macros.
+//
+#if defined(__clang__) || defined(__GNUC__)
+# pragma GCC diagnostic ignored "-Wunused-function"
+#endif
+
+extern "C"
+{
+#define PDJSON_STACK_INC 16
+#define PDJSON_STACK_MAX 2048
+#include "pdjson.c"
+}
diff --git a/libbutl/json/parser.hxx b/libbutl/json/parser.hxx
new file mode 100644
index 0000000..241ca46
--- /dev/null
+++ b/libbutl/json/parser.hxx
@@ -0,0 +1,408 @@
+#pragma once
+
+#ifdef BUILD2_BOOTSTRAP
+# error JSON parser not available during bootstrap
+#endif
+
+#include <iosfwd>
+#include <string>
+#include <cstddef> // size_t
+#include <cstdint> // uint64_t
+#include <utility> // pair
+#include <exception> // exception_ptr
+#include <stdexcept> // invalid_argument
+
+#include <libbutl/optional.hxx> // butl::optional is std::optional or similar.
+
+#include <libbutl/json/event.hxx>
+
+#include <libbutl/json/pdjson.h> // Implementation details.
+
+#include <libbutl/export.hxx>
+
+namespace butl
+{
+ // Using the RFC8259 terminology: JSON (input) text, JSON value, object
+ // member.
+ //
+ namespace json
+ {
+ class invalid_json_input: public std::invalid_argument
+ {
+ public:
+ std::string name;
+ std::uint64_t line;
+ std::uint64_t column;
+ std::uint64_t position;
+
+ invalid_json_input (std::string name,
+ std::uint64_t line,
+ std::uint64_t column,
+ std::uint64_t position,
+ const std::string& description);
+
+ invalid_json_input (std::string name,
+ std::uint64_t line,
+ std::uint64_t column,
+ std::uint64_t position,
+ const char* description);
+ };
+
+ class LIBBUTL_SYMEXPORT parser
+ {
+ public:
+ const char* input_name;
+
+ // Construction.
+ //
+
+ // Parse JSON input text from std::istream.
+ //
+ // The name argument is used to identify the input being parsed. Note
+ // that the stream, name, and separators are kept as references so they
+ // must outlive the parser instance.
+ //
+ // If stream exceptions are enabled then the std::ios_base::failure
+ // exception is used to report input/output errors (badbit and failbit).
+ // Otherwise, those are reported as the invalid_json_input exception.
+ //
+ // If multi_value is true, enable the multi-value mode in which case the
+ // input stream may contain multiple JSON values (more precisely, zero
+ // or more). If false (the default), parsing will fail unless there is
+ // exactly one JSON value in the input stream.
+ //
+ // If multi_value is true, the separators argument specifies the
+ // required separator characters between JSON values. At least one of
+ // them must be present between every pair of JSON values (in addition
+ // to any number of JSON whitespaces). No separators are required after
+ // the last JSON value (but any found will be skipped).
+ //
+ // Specifically, if it is NULL, then no separation is required (that is,
+ // both `{...}{...}` and `{...} {...}` would be valid). If it is empty,
+ // then at least one JSON whitespace is required. And if it is non-
+ // empty, then at least one of its characters must be present (for
+ // example, "\n\t" would require at least one newline or TAB character
+ // between JSON values).
+ //
+ // Note that a separator need not be valid JSON whitespace: any
+ // character is acceptable (though it probably shouldn't be an object,
+ // array, or string delimiter and should not occur within a non-self-
+ // delimited top-level value, such as `true`, `false`, `null`, or a
+ // number). All instances of required separators before and after a
+ // value are skipped. Therefore JSON Text Sequences (RFC 7464; AKA
+ // Record Separator-delimited JSON), which requires the RS (0x1E)
+ // character before each value, can be handled as well.
+ //
+ parser (std::istream&,
+ const std::string& name,
+ bool multi_value = false,
+ const char* separators = nullptr) noexcept;
+
+ parser (std::istream&,
+ const char* name,
+ bool multi_value = false,
+ const char* separators = nullptr) noexcept;
+
+ parser (std::istream&,
+ std::string&&,
+ bool = false,
+ const char* = nullptr) = delete;
+
+ // Parse a memory buffer that contains the entire JSON input text.
+ //
+ // The name argument is used to identify the input being parsed. Note
+ // that the buffer, name, and separators are kept as references so they
+ // must outlive the parser instance.
+ //
+ parser (const void* text,
+ std::size_t size,
+ const std::string& name,
+ bool multi_value = false,
+ const char* separators = nullptr) noexcept;
+
+ parser (const void* text,
+ std::size_t size,
+ const char* name,
+ bool multi_value = false,
+ const char* separators = nullptr) noexcept;
+
+ parser (const void*,
+ std::size_t,
+ std::string&&,
+ bool = false,
+ const char* = nullptr) = delete;
+
+ // Similar to the above but parse a string.
+ //
+ parser (const std::string& text,
+ const std::string& name,
+ bool multi_value = false,
+ const char* separators = nullptr) noexcept;
+
+ parser (const std::string& text,
+ const char* name,
+ bool multi_value = false,
+ const char* separators = nullptr) noexcept;
+
+ parser (const std::string&,
+ std::string&&,
+ bool = false,
+ const char* = nullptr) = delete;
+
+ // Similar to the above but parse a C-string.
+ //
+ parser (const char* text,
+ const std::string& name,
+ bool multi_value = false,
+ const char* separators = nullptr) noexcept;
+
+ parser (const char* text,
+ const char* name,
+ bool multi_value = false,
+ const char* separators = nullptr) noexcept;
+
+ parser (const char*,
+ std::string&&,
+ bool = false,
+ const char* = nullptr) = delete;
+
+ parser (parser&&) = delete;
+ parser (const parser&) = delete;
+
+ parser& operator= (parser&&) = delete;
+ parser& operator= (const parser&) = delete;
+
+ // Return the next event or nullopt if end of input is reached.
+ //
+ // In the single-value parsing mode (default) the parsing code could
+ // look like this:
+ //
+ // while (optional<event> e = p.next ())
+ // {
+ // switch (*e)
+ // {
+ // // ...
+ // }
+ // }
+ //
+ // In the multi-value mode the parser additionally returns nullopt after
+ // every JSON value parsed (so there will be two nullopt's after the
+ // last JSON value, the second indicating the end of input).
+ //
+ // One way to perform multi-value parsing is with the help of the peek()
+ // function (see below):
+ //
+ // while (p.peek ())
+ // {
+ // while (optional<event> e = p.next ())
+ // {
+ // switch (*e)
+ // {
+ // //...
+ // }
+ // }
+ // }
+ //
+ // Note that while the single-value mode will always parse exactly one
+ // value, the multi-value mode will accept zero values in which case a
+ // single nullopt is returned.
+ //
+ optional<event>
+ next ();
+
+ // The range-based for loop support.
+ //
+ // In the single-value parsing mode (default) the parsing code could
+ // look like this:
+ //
+ // for (event e: p)
+ // {
+ // switch (e)
+ // {
+ // //...
+ // }
+ // }
+ //
+ // And in the multi-value mode (see next() for more information) like
+ // this:
+ //
+ // while (p.peek ())
+ // {
+ // for (event e: p)
+ // {
+ // switch (e)
+ // {
+ // //...
+ // }
+ // }
+ // }
+ //
+ // Note that generally, the iterator interface doesn't make much sense
+ // for the parser so for now we have an implementation that is just
+ // enough for the range-based for.
+ //
+ struct iterator;
+
+ iterator begin () {return iterator (this, next ());}
+ iterator end () {return iterator (nullptr, nullopt);}
+
+ // Return the next event without considering it parsed. In other words,
+ // after this call, any subsequent calls to peek() and the next call to
+ // next() (if any) will all return the same event.
+ //
+ // Note that the name, value, and line corresponding to the peeked event
+ // are not accessible with name(), value() and line(); these functions
+ // will still return values corresponding to the most recent call to
+ // next(). The peeked values, however, can be accessed in the raw form
+ // using data().
+ //
+ optional<event>
+ peek ();
+
+ // Event data.
+ //
+
+ // Return the object member name.
+ //
+ const std::string&
+ name ();
+
+ // Any value (string, number, boolean, and null) can be retrieved as a
+ // string. Calling this function after any non-value events is illegal.
+ //
+ // Note that the value is returned as a non-const string reference and
+ // you are allowed to move the value out of it. However, this should not
+ // be done unnecessarily or in cases where the small string optimization
+ // is likely since the string's buffer is reused to store subsequent
+ // values.
+ //
+ std::string&
+ value ();
+
+ // Convert the value to an integer, floating point, or bool. Throw
+ // invalid_json_input if the conversion is impossible without a loss.
+ //
+ template <typename T>
+ T
+ value () const;
+
+ // Return the value or object member name in the raw form.
+ //
+ // Calling this function on non-value/name events is legal in which case
+ // NULL is returned. Note also that the returned data corresponds to the
+ // most recent event, whether peeked or parsed.
+ //
+ std::pair<const char*, std::size_t>
+ data () const {return std::make_pair (raw_s_, raw_n_);}
+
+ // Return the line number (1-based) corresponding to the most recently
+ // parsed event or 0 if nothing has been parsed yet.
+ //
+ std::uint64_t
+ line () const noexcept;
+
+ // Return the column number (1-based) corresponding to the beginning of
+ // the most recently parsed event or 0 if nothing has been parsed yet.
+ //
+ std::uint64_t
+ column () const noexcept;
+
+ // Return the position (byte offset) pointing immediately after the most
+ // recently parsed event or 0 if nothing has been parsed yet.
+ //
+ std::uint64_t
+ position () const noexcept;
+
+ // Implementation details.
+ //
+ public:
+ struct iterator
+ {
+ using value_type = event;
+
+ explicit
+ iterator (parser* p = nullptr, optional<event> e = nullopt)
+ : p_ (p), e_ (e) {}
+
+ event operator* () const {return *e_;}
+ iterator& operator++ () {e_ = p_->next (); return *this;}
+
+ // Comparison only makes sense when comparing to end (eof).
+ //
+ bool operator== (iterator y) const {return !e_ && !y.e_;}
+ bool operator!= (iterator y) const {return !(*this == y);}
+
+ private:
+ parser* p_;
+ optional<event> e_;
+ };
+
+ struct stream
+ {
+ std::istream* is;
+ optional<std::exception_ptr> exception;
+ };
+
+ [[noreturn]] void
+ throw_invalid_value (const char* type, const char*, std::size_t) const;
+
+ ~parser ();
+
+ private:
+ // Functionality shared by next() and peek().
+ //
+ json_type
+ next_impl ();
+
+ // Translate the event produced by the most recent call to next_impl().
+ //
+ // Note that the underlying parser state determines whether name or
+ // value is returned when translating JSON_STRING.
+ //
+ optional<event>
+ translate (json_type) const noexcept;
+
+ // Cache state (name/value) produced by the most recent call to
+ // next_impl().
+ //
+ void
+ cache_parsed_data ();
+
+ // Cache the location numbers as determined by the most recent call to
+ // next_impl().
+ //
+ void
+ cache_parsed_location () noexcept;
+
+ // Return true if this is a value event (string, number, boolean, or
+ // null).
+ //
+ static bool
+ value_event (optional<event>) noexcept;
+
+ stream stream_;
+
+ bool multi_value_;
+ const char* separators_;
+
+ // The *_p_ members indicate whether the value is present (cached).
+ // Note: not using optional not to reallocate the string's buffer.
+ //
+ std::string name_; bool name_p_ = false;
+ std::string value_; bool value_p_ = false;
+ std::uint64_t line_, column_, position_; bool location_p_ = false;
+
+ optional<json_type> parsed_; // Current parsed event if any.
+ optional<json_type> peeked_; // Current peeked event if any.
+
+ ::json_stream impl_[1];
+
+ // Cached raw value.
+ //
+ const char* raw_s_;
+ std::size_t raw_n_;
+ };
+ }
+}
+
+#include <libbutl/json/parser.ixx>
diff --git a/libbutl/json/parser.ixx b/libbutl/json/parser.ixx
new file mode 100644
index 0000000..3f02a1e
--- /dev/null
+++ b/libbutl/json/parser.ixx
@@ -0,0 +1,216 @@
+#include <cerrno>
+#include <limits> // numeric_limits
+#include <utility> // move()
+#include <cassert>
+#include <cstdlib> // strto*()
+#include <type_traits> // enable_if, is_*
+#include <cstring> // strlen()
+
+namespace butl
+{
+ namespace json
+ {
+ inline invalid_json_input::
+ invalid_json_input (std::string n,
+ std::uint64_t l,
+ std::uint64_t c,
+ std::uint64_t p,
+ const std::string& d)
+ : invalid_json_input (move (n), l, c, p, d.c_str ())
+ {
+ }
+
+ inline invalid_json_input::
+ invalid_json_input (std::string n,
+ std::uint64_t l,
+ std::uint64_t c,
+ std::uint64_t p,
+ const char* d)
+ : invalid_argument (d),
+ name (std::move (n)),
+ line (l), column (c), position (p)
+ {
+ }
+
+ inline parser::
+ parser (std::istream& is,
+ const std::string& n,
+ bool mv,
+ const char* sep) noexcept
+ : parser (is, n.c_str (), mv, sep)
+ {
+ }
+
+ inline parser::
+ parser (const void* t,
+ std::size_t s,
+ const std::string& n,
+ bool mv,
+ const char* sep) noexcept
+ : parser (t, s, n.c_str (), mv, sep)
+ {
+ }
+
+ inline parser::
+ parser (const std::string& t,
+ const std::string& n,
+ bool mv,
+ const char* sep) noexcept
+ : parser (t.data (), t.size (), n.c_str (), mv, sep)
+ {
+ }
+
+ inline parser::
+ parser (const std::string& t,
+ const char* n,
+ bool mv,
+ const char* sep) noexcept
+ : parser (t.data (), t.size (), n, mv, sep)
+ {
+ }
+
+ inline parser::
+ parser (const char* t,
+ const std::string& n,
+ bool mv,
+ const char* sep) noexcept
+ : parser (t, std::strlen (t), n.c_str (), mv, sep)
+ {
+ }
+
+ inline parser::
+ parser (const char* t,
+ const char* n,
+ bool mv,
+ const char* sep) noexcept
+ : parser (t, std::strlen (t), n, mv, sep)
+ {
+ }
+
+ inline const std::string& parser::
+ name ()
+ {
+ if (!name_p_)
+ {
+ assert (parsed_ && !peeked_ && !value_p_);
+ cache_parsed_data ();
+ assert (name_p_);
+ }
+ return name_;
+ }
+
+ inline std::string& parser::
+ value ()
+ {
+ if (!value_p_)
+ {
+ assert (parsed_ && !peeked_ && !name_p_);
+ cache_parsed_data ();
+ assert (value_p_);
+ }
+ return value_;
+ }
+
+ // Note: one day we will be able to use C++17 from_chars() which was made
+ // exactly for this.
+ //
+ template <typename T>
+ inline typename std::enable_if<std::is_same<T, bool>::value, T>::type
+ parse_value (const char* b, size_t, const parser&)
+ {
+ return *b == 't';
+ }
+
+ template <typename T>
+ inline typename std::enable_if<
+ std::is_integral<T>::value &&
+ std::is_signed<T>::value &&
+ !std::is_same<T, bool>::value, T>::type
+ parse_value (const char* b, size_t n, const parser& p)
+ {
+ char* e (nullptr);
+ errno = 0; // We must clear it according to POSIX.
+ std::int64_t v (strtoll (b, &e, 10)); // Can't throw.
+
+ if (e == b || e != b + n || errno == ERANGE ||
+ v < std::numeric_limits<T>::min () ||
+ v > std::numeric_limits<T>::max ())
+ p.throw_invalid_value ("signed integer", b, n);
+
+ return static_cast<T> (v);
+ }
+
+ template <typename T>
+ inline typename std::enable_if<
+ std::is_integral<T>::value &&
+ std::is_unsigned<T>::value &&
+ !std::is_same<T, bool>::value, T>::type
+ parse_value (const char* b, size_t n, const parser& p)
+ {
+ char* e (nullptr);
+ errno = 0; // We must clear it according to POSIX.
+ std::uint64_t v (strtoull (b, &e, 10)); // Can't throw.
+
+ if (e == b || e != b + n || errno == ERANGE ||
+ v > std::numeric_limits<T>::max ())
+ p.throw_invalid_value ("unsigned integer", b, n);
+
+ return static_cast<T> (v);
+ }
+
+ template <typename T>
+ inline typename std::enable_if<std::is_same<T, float>::value, T>::type
+ parse_value (const char* b, size_t n, const parser& p)
+ {
+ char* e (nullptr);
+ errno = 0; // We must clear it according to POSIX.
+ T r (std::strtof (b, &e));
+
+ if (e == b || e != b + n || errno == ERANGE)
+ p.throw_invalid_value ("float", b, n);
+
+ return r;
+ }
+
+ template <typename T>
+ inline typename std::enable_if<std::is_same<T, double>::value, T>::type
+ parse_value (const char* b, size_t n, const parser& p)
+ {
+ char* e (nullptr);
+ errno = 0; // We must clear it according to POSIX.
+ T r (std::strtod (b, &e));
+
+ if (e == b || e != b + n || errno == ERANGE)
+ p.throw_invalid_value ("double", b, n);
+
+ return r;
+ }
+
+ template <typename T>
+ inline typename std::enable_if<std::is_same<T, long double>::value, T>::type
+ parse_value (const char* b, size_t n, const parser& p)
+ {
+ char* e (nullptr);
+ errno = 0; // We must clear it according to POSIX.
+ T r (std::strtold (b, &e));
+
+ if (e == b || e != b + n || errno == ERANGE)
+ p.throw_invalid_value ("long double", b, n);
+
+ return r;
+ }
+
+ template <typename T>
+ inline T parser::
+ value () const
+ {
+ if (!value_p_)
+ {
+ assert (parsed_ && !peeked_ && value_event (translate (*parsed_)));
+ return parse_value<T> (raw_s_, raw_n_, *this);
+ }
+
+ return parse_value<T> (value_.data (), value_.size (), *this);
+ }
+ }
+}
diff --git a/libbutl/json/pdjson.c b/libbutl/json/pdjson.c
new file mode 100644
index 0000000..279d169
--- /dev/null
+++ b/libbutl/json/pdjson.c
@@ -0,0 +1,1020 @@
+#ifndef _POSIX_C_SOURCE
+# define _POSIX_C_SOURCE 200112L
+#elif _POSIX_C_SOURCE < 200112L
+# error incompatible _POSIX_C_SOURCE level
+#endif
+
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+
+#ifndef PDJSON_H
+# include "pdjson.h"
+#endif
+
+#define JSON_FLAG_ERROR (1u << 0)
+#define JSON_FLAG_STREAMING (1u << 1)
+
+#if defined(_MSC_VER) && (_MSC_VER < 1900)
+
+#define json_error(json, format, ...) \
+ if (!(json->flags & JSON_FLAG_ERROR)) { \
+ json->flags |= JSON_FLAG_ERROR; \
+ _snprintf_s(json->errmsg, sizeof(json->errmsg), \
+ _TRUNCATE, \
+ format, \
+ __VA_ARGS__); \
+ } \
+
+#else
+
+#define json_error(json, format, ...) \
+ if (!(json->flags & JSON_FLAG_ERROR)) { \
+ json->flags |= JSON_FLAG_ERROR; \
+ snprintf(json->errmsg, sizeof(json->errmsg), \
+ format, \
+ __VA_ARGS__); \
+ } \
+
+#endif /* _MSC_VER */
+
+/* See also PDJSON_STACK_MAX below. */
+#ifndef PDJSON_STACK_INC
+# define PDJSON_STACK_INC 4
+#endif
+
+struct json_stack {
+ enum json_type type;
+ long count;
+};
+
+static enum json_type
+push(json_stream *json, enum json_type type)
+{
+ json->stack_top++;
+
+#ifdef PDJSON_STACK_MAX
+ if (json->stack_top > PDJSON_STACK_MAX) {
+ json_error(json, "%s", "maximum depth of nesting reached");
+ return JSON_ERROR;
+ }
+#endif
+
+ if (json->stack_top >= json->stack_size) {
+ struct json_stack *stack;
+ size_t size = (json->stack_size + PDJSON_STACK_INC) * sizeof(*json->stack);
+ stack = (struct json_stack *)json->alloc.realloc(json->stack, size);
+ if (stack == NULL) {
+ json_error(json, "%s", "out of memory");
+ return JSON_ERROR;
+ }
+
+ json->stack_size += PDJSON_STACK_INC;
+ json->stack = stack;
+ }
+
+ json->stack[json->stack_top].type = type;
+ json->stack[json->stack_top].count = 0;
+
+ return type;
+}
+
+static enum json_type
+pop(json_stream *json, int c, enum json_type expected)
+{
+ if (json->stack == NULL || json->stack[json->stack_top].type != expected) {
+ json_error(json, "unexpected byte '%c'", c);
+ return JSON_ERROR;
+ }
+ json->stack_top--;
+ return expected == JSON_ARRAY ? JSON_ARRAY_END : JSON_OBJECT_END;
+}
+
+static int buffer_peek(struct json_source *source)
+{
+ if (source->position < source->source.buffer.length)
+ return source->source.buffer.buffer[source->position];
+ else
+ return EOF;
+}
+
+static int buffer_get(struct json_source *source)
+{
+ int c = source->peek(source);
+ if (c != EOF)
+ source->position++;
+ return c;
+}
+
+static int stream_get(struct json_source *source)
+{
+ int c = fgetc(source->source.stream.stream);
+ if (c != EOF)
+ source->position++;
+ return c;
+}
+
+static int stream_peek(struct json_source *source)
+{
+ int c = fgetc(source->source.stream.stream);
+ ungetc(c, source->source.stream.stream);
+ return c;
+}
+
+static void init(json_stream *json)
+{
+ json->lineno = 1;
+ json->linepos = 0;
+ json->lineadj = 0;
+ json->linecon = 0;
+ json->colno = 0;
+ json->flags = JSON_FLAG_STREAMING;
+ json->errmsg[0] = '\0';
+ json->ntokens = 0;
+ json->next = (enum json_type)0;
+
+ json->stack = NULL;
+ json->stack_top = -1;
+ json->stack_size = 0;
+
+ json->data.string = NULL;
+ json->data.string_size = 0;
+ json->data.string_fill = 0;
+ json->source.position = 0;
+
+ json->alloc.malloc = malloc;
+ json->alloc.realloc = realloc;
+ json->alloc.free = free;
+}
+
+static enum json_type
+is_match(json_stream *json, const char *pattern, enum json_type type)
+{
+ int c;
+ for (const char *p = pattern; *p; p++) {
+ if (*p != (c = json->source.get(&json->source))) {
+ json_error(json, "expected '%c' instead of byte '%c'", *p, c);
+ return JSON_ERROR;
+ }
+ }
+ return type;
+}
+
+static int pushchar(json_stream *json, int c)
+{
+ if (json->data.string_fill == json->data.string_size) {
+ size_t size = json->data.string_size * 2;
+ char *buffer = (char *)json->alloc.realloc(json->data.string, size);
+ if (buffer == NULL) {
+ json_error(json, "%s", "out of memory");
+ return -1;
+ } else {
+ json->data.string_size = size;
+ json->data.string = buffer;
+ }
+ }
+ json->data.string[json->data.string_fill++] = c;
+ return 0;
+}
+
+static int init_string(json_stream *json)
+{
+ json->data.string_fill = 0;
+ if (json->data.string == NULL) {
+ json->data.string_size = 1024;
+ json->data.string = (char *)json->alloc.malloc(json->data.string_size);
+ if (json->data.string == NULL) {
+ json_error(json, "%s", "out of memory");
+ return -1;
+ }
+ }
+ json->data.string[0] = '\0';
+ return 0;
+}
+
+static int encode_utf8(json_stream *json, unsigned long c)
+{
+ if (c < 0x80UL) {
+ return pushchar(json, c);
+ } else if (c < 0x0800UL) {
+ return !((pushchar(json, (c >> 6 & 0x1F) | 0xC0) == 0) &&
+ (pushchar(json, (c >> 0 & 0x3F) | 0x80) == 0));
+ } else if (c < 0x010000UL) {
+ if (c >= 0xd800 && c <= 0xdfff) {
+ json_error(json, "invalid codepoint %06lx", c);
+ return -1;
+ }
+ return !((pushchar(json, (c >> 12 & 0x0F) | 0xE0) == 0) &&
+ (pushchar(json, (c >> 6 & 0x3F) | 0x80) == 0) &&
+ (pushchar(json, (c >> 0 & 0x3F) | 0x80) == 0));
+ } else if (c < 0x110000UL) {
+ return !((pushchar(json, (c >> 18 & 0x07) | 0xF0) == 0) &&
+ (pushchar(json, (c >> 12 & 0x3F) | 0x80) == 0) &&
+ (pushchar(json, (c >> 6 & 0x3F) | 0x80) == 0) &&
+ (pushchar(json, (c >> 0 & 0x3F) | 0x80) == 0));
+ } else {
+ json_error(json, "unable to encode %06lx as UTF-8", c);
+ return -1;
+ }
+}
+
+static int hexchar(int c)
+{
+ switch (c) {
+ case '0': return 0;
+ case '1': return 1;
+ case '2': return 2;
+ case '3': return 3;
+ case '4': return 4;
+ case '5': return 5;
+ case '6': return 6;
+ case '7': return 7;
+ case '8': return 8;
+ case '9': return 9;
+ case 'a':
+ case 'A': return 10;
+ case 'b':
+ case 'B': return 11;
+ case 'c':
+ case 'C': return 12;
+ case 'd':
+ case 'D': return 13;
+ case 'e':
+ case 'E': return 14;
+ case 'f':
+ case 'F': return 15;
+ default:
+ return -1;
+ }
+}
+
+static long
+read_unicode_cp(json_stream *json)
+{
+ long cp = 0;
+ int shift = 12;
+
+ for (size_t i = 0; i < 4; i++) {
+ int c = json->source.get(&json->source);
+ int hc;
+
+ if (c == EOF) {
+ json_error(json, "%s", "unterminated string literal in Unicode");
+ return -1;
+ } else if ((hc = hexchar(c)) == -1) {
+ json_error(json, "invalid escape Unicode byte '%c'", c);
+ return -1;
+ }
+
+ cp += hc * (1 << shift);
+ shift -= 4;
+ }
+
+
+ return cp;
+}
+
+static int read_unicode(json_stream *json)
+{
+ long cp, h, l;
+
+ if ((cp = read_unicode_cp(json)) == -1) {
+ return -1;
+ }
+
+ if (cp >= 0xd800 && cp <= 0xdbff) {
+ /* This is the high portion of a surrogate pair; we need to read the
+ * lower portion to get the codepoint
+ */
+ h = cp;
+
+ int c = json->source.get(&json->source);
+ if (c == EOF) {
+ json_error(json, "%s", "unterminated string literal in Unicode");
+ return -1;
+ } else if (c != '\\') {
+ json_error(json, "invalid continuation for surrogate pair '%c', "
+ "expected '\\'", c);
+ return -1;
+ }
+
+ c = json->source.get(&json->source);
+ if (c == EOF) {
+ json_error(json, "%s", "unterminated string literal in Unicode");
+ return -1;
+ } else if (c != 'u') {
+ json_error(json, "invalid continuation for surrogate pair '%c', "
+ "expected 'u'", c);
+ return -1;
+ }
+
+ if ((l = read_unicode_cp(json)) == -1) {
+ return -1;
+ }
+
+ if (l < 0xdc00 || l > 0xdfff) {
+ json_error(json, "surrogate pair continuation \\u%04lx out "
+ "of range (dc00-dfff)", l);
+ return -1;
+ }
+
+ cp = ((h - 0xd800) * 0x400) + ((l - 0xdc00) + 0x10000);
+ } else if (cp >= 0xdc00 && cp <= 0xdfff) {
+ json_error(json, "dangling surrogate \\u%04lx", cp);
+ return -1;
+ }
+
+ return encode_utf8(json, cp);
+}
+
+static int
+read_escaped(json_stream *json)
+{
+ int c = json->source.get(&json->source);
+ if (c == EOF) {
+ json_error(json, "%s", "unterminated string literal in escape");
+ return -1;
+ } else if (c == 'u') {
+ if (read_unicode(json) != 0)
+ return -1;
+ } else {
+ switch (c) {
+ case '\\':
+ case 'b':
+ case 'f':
+ case 'n':
+ case 'r':
+ case 't':
+ case '/':
+ case '"':
+ {
+ const char *codes = "\\bfnrt/\"";
+ const char *p = strchr(codes, c);
+ if (pushchar(json, "\\\b\f\n\r\t/\""[p - codes]) != 0)
+ return -1;
+ }
+ break;
+ default:
+ json_error(json, "invalid escaped byte '%c'", c);
+ return -1;
+ }
+ }
+ return 0;
+}
+
+static int
+char_needs_escaping(int c)
+{
+ if ((c >= 0) && (c < 0x20 || c == 0x22 || c == 0x5c)) {
+ return 1;
+ }
+
+ return 0;
+}
+
+static int
+utf8_seq_length(char byte)
+{
+ unsigned char u = (unsigned char) byte;
+ if (u < 0x80) return 1;
+
+ if (0x80 <= u && u <= 0xBF)
+ {
+ // second, third or fourth byte of a multi-byte
+ // sequence, i.e. a "continuation byte"
+ return 0;
+ }
+ else if (u == 0xC0 || u == 0xC1)
+ {
+ // overlong encoding of an ASCII byte
+ return 0;
+ }
+ else if (0xC2 <= u && u <= 0xDF)
+ {
+ // 2-byte sequence
+ return 2;
+ }
+ else if (0xE0 <= u && u <= 0xEF)
+ {
+ // 3-byte sequence
+ return 3;
+ }
+ else if (0xF0 <= u && u <= 0xF4)
+ {
+ // 4-byte sequence
+ return 4;
+ }
+ else
+ {
+ // u >= 0xF5
+ // Restricted (start of 4-, 5- or 6-byte sequence) or invalid UTF-8
+ return 0;
+ }
+}
+
+static int
+is_legal_utf8(const unsigned char *bytes, int length)
+{
+ if (0 == bytes || 0 == length) return 0;
+
+ unsigned char a;
+ const unsigned char* srcptr = bytes + length;
+ switch (length)
+ {
+ default:
+ return 0;
+ // Everything else falls through when true.
+ case 4:
+ if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0;
+ /* FALLTHRU */
+ case 3:
+ if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return 0;
+ /* FALLTHRU */
+ case 2:
+ a = (*--srcptr);
+ switch (*bytes)
+ {
+ case 0xE0:
+ if (a < 0xA0 || a > 0xBF) return 0;
+ break;
+ case 0xED:
+ if (a < 0x80 || a > 0x9F) return 0;
+ break;
+ case 0xF0:
+ if (a < 0x90 || a > 0xBF) return 0;
+ break;
+ case 0xF4:
+ if (a < 0x80 || a > 0x8F) return 0;
+ break;
+ default:
+ if (a < 0x80 || a > 0xBF) return 0;
+ break;
+ }
+ /* FALLTHRU */
+ case 1:
+ if (*bytes >= 0x80 && *bytes < 0xC2) return 0;
+ }
+ return *bytes <= 0xF4;
+}
+
+static int
+read_utf8(json_stream* json, int next_char)
+{
+ int count = utf8_seq_length(next_char);
+ if (!count)
+ {
+ json_error(json, "%s", "invalid UTF-8 character");
+ return -1;
+ }
+
+ char buffer[4];
+ buffer[0] = next_char;
+ int i;
+ for (i = 1; i < count; ++i)
+ {
+ if ((buffer[i] = json->source.get(&json->source)) != EOF)
+ json->lineadj++;
+ }
+
+ if (!is_legal_utf8((unsigned char*) buffer, count))
+ {
+ json_error(json, "%s", "invalid UTF-8 text");
+ return -1;
+ }
+
+ for (i = 0; i < count; ++i)
+ {
+ if (pushchar(json, buffer[i]) != 0)
+ return -1;
+ }
+ return 0;
+}
+
+static enum json_type
+read_string(json_stream *json)
+{
+ if (init_string(json) != 0)
+ return JSON_ERROR;
+ while (1) {
+ int c = json->source.get(&json->source);
+ if (c == EOF) {
+ json_error(json, "%s", "unterminated string literal");
+ return JSON_ERROR;
+ } else if (c == '"') {
+ if (pushchar(json, '\0') == 0)
+ return JSON_STRING;
+ else
+ return JSON_ERROR;
+ } else if (c == '\\') {
+ if (read_escaped(json) != 0)
+ return JSON_ERROR;
+ } else if ((unsigned) c >= 0x80) {
+ if (read_utf8(json, c) != 0)
+ return JSON_ERROR;
+ } else {
+ if (char_needs_escaping(c)) {
+ json_error(json, "%s", "unescaped control character in string");
+ return JSON_ERROR;
+ }
+
+ if (pushchar(json, c) != 0)
+ return JSON_ERROR;
+ }
+ }
+ return JSON_ERROR;
+}
+
+static int
+is_digit(int c)
+{
+ return c >= 48 /*0*/ && c <= 57 /*9*/;
+}
+
+static int
+read_digits(json_stream *json)
+{
+ int c;
+ unsigned nread = 0;
+ while (is_digit(c = json->source.peek(&json->source))) {
+ if (pushchar(json, json->source.get(&json->source)) != 0)
+ return -1;
+
+ nread++;
+ }
+
+ if (nread == 0) {
+ json_error(json, "expected digit instead of byte '%c'", c);
+ return -1;
+ }
+
+ return 0;
+}
+
+static enum json_type
+read_number(json_stream *json, int c)
+{
+ if (pushchar(json, c) != 0)
+ return JSON_ERROR;
+ if (c == '-') {
+ c = json->source.get(&json->source);
+ if (is_digit(c)) {
+ return read_number(json, c);
+ } else {
+ json_error(json, "unexpected byte '%c' in number", c);
+ return JSON_ERROR;
+ }
+ } else if (strchr("123456789", c) != NULL) {
+ c = json->source.peek(&json->source);
+ if (is_digit(c)) {
+ if (read_digits(json) != 0)
+ return JSON_ERROR;
+ }
+ }
+ /* Up to decimal or exponent has been read. */
+ c = json->source.peek(&json->source);
+ if (strchr(".eE", c) == NULL) {
+ if (pushchar(json, '\0') != 0)
+ return JSON_ERROR;
+ else
+ return JSON_NUMBER;
+ }
+ if (c == '.') {
+ json->source.get(&json->source); // consume .
+ if (pushchar(json, c) != 0)
+ return JSON_ERROR;
+ if (read_digits(json) != 0)
+ return JSON_ERROR;
+ }
+ /* Check for exponent. */
+ c = json->source.peek(&json->source);
+ if (c == 'e' || c == 'E') {
+ json->source.get(&json->source); // consume e/E
+ if (pushchar(json, c) != 0)
+ return JSON_ERROR;
+ c = json->source.peek(&json->source);
+ if (c == '+' || c == '-') {
+ json->source.get(&json->source); // consume
+ if (pushchar(json, c) != 0)
+ return JSON_ERROR;
+ if (read_digits(json) != 0)
+ return JSON_ERROR;
+ } else if (is_digit(c)) {
+ if (read_digits(json) != 0)
+ return JSON_ERROR;
+ } else {
+ json->source.get(&json->source); // consume (for column)
+ json_error(json, "unexpected byte '%c' in number", c);
+ return JSON_ERROR;
+ }
+ }
+ if (pushchar(json, '\0') != 0)
+ return JSON_ERROR;
+ else
+ return JSON_NUMBER;
+}
+
+bool
+json_isspace(int c)
+{
+ switch (c) {
+ case 0x09:
+ case 0x0a:
+ case 0x0d:
+ case 0x20:
+ return true;
+ }
+
+ return false;
+}
+
+static void newline(json_stream *json)
+{
+ json->lineno++;
+ json->linepos = json->source.position;
+ json->lineadj = 0;
+ json->linecon = 0;
+}
+
+/* Returns the next non-whitespace character in the stream.
+ *
+ * Note that this is the only function (besides user-facing json_source_get())
+ * that needs to worry about newline housekeeping.
+ */
+static int next(json_stream *json)
+{
+ int c;
+ while (json_isspace(c = json->source.get(&json->source)))
+ if (c == '\n')
+ newline(json);
+ return c;
+}
+
+static enum json_type
+read_value(json_stream *json, int c)
+{
+ enum json_type type;
+ size_t colno = json_get_column(json);
+
+ json->ntokens++;
+
+ switch (c) {
+ case EOF:
+ json_error(json, "%s", "unexpected end of text");
+ type = JSON_ERROR;
+ break;
+ case '{':
+ type = push(json, JSON_OBJECT);
+ break;
+ case '[':
+ type = push(json, JSON_ARRAY);
+ break;
+ case '"':
+ type = read_string(json);
+ break;
+ case 'n':
+ type = is_match(json, "ull", JSON_NULL);
+ break;
+ case 'f':
+ type = is_match(json, "alse", JSON_FALSE);
+ break;
+ case 't':
+ type = is_match(json, "rue", JSON_TRUE);
+ break;
+ case '0':
+ case '1':
+ case '2':
+ case '3':
+ case '4':
+ case '5':
+ case '6':
+ case '7':
+ case '8':
+ case '9':
+ case '-':
+ type = init_string(json) == 0 ? read_number(json, c) : JSON_ERROR;
+ break;
+ default:
+ type = JSON_ERROR;
+ json_error(json, "unexpected byte '%c' in value", c);
+ break;
+ }
+
+ if (type != JSON_ERROR)
+ json->colno = colno;
+
+ return type;
+}
+
+enum json_type json_peek(json_stream *json)
+{
+ enum json_type next;
+ if (json->next)
+ next = json->next;
+ else
+ next = json->next = json_next(json);
+ return next;
+}
+
+enum json_type json_next(json_stream *json)
+{
+ if (json->flags & JSON_FLAG_ERROR)
+ return JSON_ERROR;
+ if (json->next != 0) {
+ enum json_type next = json->next;
+ json->next = (enum json_type)0;
+ return next;
+ }
+
+ json->colno = 0;
+
+ if (json->ntokens > 0 && json->stack_top == (size_t)-1) {
+
+ /* In the streaming mode leave any trailing whitespaces in the stream.
+ * This allows the user to validate any desired separation between
+ * values (such as newlines) using json_source_get/peek() with any
+ * remaining whitespaces ignored as leading when we parse the next
+ * value. */
+ if (!(json->flags & JSON_FLAG_STREAMING)) {
+ int c = next(json);
+ if (c != EOF) {
+ json_error(json, "expected end of text instead of byte '%c'", c);
+ return JSON_ERROR;
+ }
+ }
+
+ return JSON_DONE;
+ }
+ int c = next(json);
+ if (json->stack_top == (size_t)-1) {
+ if (c == EOF && (json->flags & JSON_FLAG_STREAMING))
+ return JSON_DONE;
+
+ return read_value(json, c);
+ }
+ if (json->stack[json->stack_top].type == JSON_ARRAY) {
+ if (json->stack[json->stack_top].count == 0) {
+ if (c == ']') {
+ return pop(json, c, JSON_ARRAY);
+ }
+ json->stack[json->stack_top].count++;
+ return read_value(json, c);
+ } else if (c == ',') {
+ json->stack[json->stack_top].count++;
+ return read_value(json, next(json));
+ } else if (c == ']') {
+ return pop(json, c, JSON_ARRAY);
+ } else {
+ json_error(json, "unexpected byte '%c'", c);
+ return JSON_ERROR;
+ }
+ } else if (json->stack[json->stack_top].type == JSON_OBJECT) {
+ if (json->stack[json->stack_top].count == 0) {
+ if (c == '}') {
+ return pop(json, c, JSON_OBJECT);
+ }
+
+ /* No member name/value pairs yet. */
+ enum json_type value = read_value(json, c);
+ if (value != JSON_STRING) {
+ if (value != JSON_ERROR)
+ json_error(json, "%s", "expected member name or '}'");
+ return JSON_ERROR;
+ } else {
+ json->stack[json->stack_top].count++;
+ return value;
+ }
+ } else if ((json->stack[json->stack_top].count % 2) == 0) {
+ /* Expecting comma followed by member name. */
+ if (c != ',' && c != '}') {
+ json_error(json, "%s", "expected ',' or '}' after member value");
+ return JSON_ERROR;
+ } else if (c == '}') {
+ return pop(json, c, JSON_OBJECT);
+ } else {
+ enum json_type value = read_value(json, next(json));
+ if (value != JSON_STRING) {
+ if (value != JSON_ERROR)
+ json_error(json, "%s", "expected member name");
+ return JSON_ERROR;
+ } else {
+ json->stack[json->stack_top].count++;
+ return value;
+ }
+ }
+ } else if ((json->stack[json->stack_top].count % 2) == 1) {
+ /* Expecting colon followed by value. */
+ if (c != ':') {
+ json_error(json, "%s", "expected ':' after member name");
+ return JSON_ERROR;
+ } else {
+ json->stack[json->stack_top].count++;
+ return read_value(json, next(json));
+ }
+ }
+ }
+ json_error(json, "%s", "invalid parser state");
+ return JSON_ERROR;
+}
+
+void json_reset(json_stream *json)
+{
+ json->stack_top = -1;
+ json->ntokens = 0;
+ json->flags &= ~JSON_FLAG_ERROR;
+ json->errmsg[0] = '\0';
+}
+
+enum json_type json_skip(json_stream *json)
+{
+ enum json_type type = json_next(json);
+ size_t cnt_arr = 0;
+ size_t cnt_obj = 0;
+
+ for (enum json_type skip = type; ; skip = json_next(json)) {
+ if (skip == JSON_ERROR || skip == JSON_DONE)
+ return skip;
+
+ if (skip == JSON_ARRAY) {
+ ++cnt_arr;
+ } else if (skip == JSON_ARRAY_END && cnt_arr > 0) {
+ --cnt_arr;
+ } else if (skip == JSON_OBJECT) {
+ ++cnt_obj;
+ } else if (skip == JSON_OBJECT_END && cnt_obj > 0) {
+ --cnt_obj;
+ }
+
+ if (!cnt_arr && !cnt_obj)
+ break;
+ }
+
+ return type;
+}
+
+enum json_type json_skip_until(json_stream *json, enum json_type type)
+{
+ while (1) {
+ enum json_type skip = json_skip(json);
+
+ if (skip == JSON_ERROR || skip == JSON_DONE)
+ return skip;
+
+ if (skip == type)
+ break;
+ }
+
+ return type;
+}
+
+const char *json_get_string(json_stream *json, size_t *length)
+{
+ if (length != NULL)
+ *length = json->data.string_fill;
+ if (json->data.string == NULL)
+ return "";
+ else
+ return json->data.string;
+}
+
+double json_get_number(json_stream *json)
+{
+ char *p = json->data.string;
+ return p == NULL ? 0 : strtod(p, NULL);
+}
+
+const char *json_get_error(json_stream *json)
+{
+ return json->flags & JSON_FLAG_ERROR ? json->errmsg : NULL;
+}
+
+size_t json_get_lineno(json_stream *json)
+{
+ return json->lineno;
+}
+
+size_t json_get_position(json_stream *json)
+{
+ return json->source.position;
+}
+
+size_t json_get_column(json_stream *json)
+{
+ return json->colno == 0
+ ? json->source.position == 0 ? 1 : json->source.position - json->linepos - json->lineadj
+ : json->colno;
+}
+
+size_t json_get_depth(json_stream *json)
+{
+ return json->stack_top + 1;
+}
+
+/* Return the current parsing context, that is, JSON_OBJECT if we are inside
+ an object, JSON_ARRAY if we are inside an array, and JSON_DONE if we are
+ not yet/anymore in either.
+
+ Additionally, for the first two cases, also return the number of parsing
+ events that have already been observed at this level with json_next/peek().
+ In particular, inside an object, an odd number would indicate that the just
+ observed JSON_STRING event is a member name.
+*/
+enum json_type json_get_context(json_stream *json, size_t *count)
+{
+ if (json->stack_top == (size_t)-1)
+ return JSON_DONE;
+
+ if (count != NULL)
+ *count = json->stack[json->stack_top].count;
+
+ return json->stack[json->stack_top].type;
+}
+
+int json_source_get(json_stream *json)
+{
+ /* If the caller reads a multi-byte UTF-8 sequence, we expect them to read
+ * it in its entirety. We also assume that any invalid bytes within such a
+ * sequence belong to the same column (as opposed to starting a new column
+ * or some such). */
+
+ int c = json->source.get(&json->source);
+ if (json->linecon > 0) {
+ /* Expecting a continuation byte within a multi-byte UTF-8 sequence. */
+ json->linecon--;
+ if (c != EOF)
+ json->lineadj++;
+ } else if (c == '\n')
+ newline(json);
+ else if (c >= 0xC2 && c <= 0xF4) /* First in multi-byte UTF-8 sequence. */
+ json->linecon = utf8_seq_length(c) - 1;
+
+ return c;
+}
+
+int json_source_peek(json_stream *json)
+{
+ return json->source.peek(&json->source);
+}
+
+void json_open_buffer(json_stream *json, const void *buffer, size_t size)
+{
+ init(json);
+ json->source.get = buffer_get;
+ json->source.peek = buffer_peek;
+ json->source.source.buffer.buffer = (const char *)buffer;
+ json->source.source.buffer.length = size;
+}
+
+void json_open_string(json_stream *json, const char *string)
+{
+ json_open_buffer(json, string, strlen(string));
+}
+
+void json_open_stream(json_stream *json, FILE * stream)
+{
+ init(json);
+ json->source.get = stream_get;
+ json->source.peek = stream_peek;
+ json->source.source.stream.stream = stream;
+}
+
+static int user_get(struct json_source *json)
+{
+ int c = json->source.user.get(json->source.user.ptr);
+ if (c != EOF)
+ json->position++;
+ return c;
+}
+
+static int user_peek(struct json_source *json)
+{
+ return json->source.user.peek(json->source.user.ptr);
+}
+
+void json_open_user(json_stream *json, json_user_io get, json_user_io peek, void *user)
+{
+ init(json);
+ json->source.get = user_get;
+ json->source.peek = user_peek;
+ json->source.source.user.ptr = user;
+ json->source.source.user.get = get;
+ json->source.source.user.peek = peek;
+}
+
+void json_set_allocator(json_stream *json, json_allocator *a)
+{
+ json->alloc = *a;
+}
+
+void json_set_streaming(json_stream *json, bool streaming)
+{
+ if (streaming)
+ json->flags |= JSON_FLAG_STREAMING;
+ else
+ json->flags &= ~JSON_FLAG_STREAMING;
+}
+
+void json_close(json_stream *json)
+{
+ json->alloc.free(json->stack);
+ json->alloc.free(json->data.string);
+}
diff --git a/libbutl/json/pdjson.h b/libbutl/json/pdjson.h
new file mode 100644
index 0000000..ac698e4
--- /dev/null
+++ b/libbutl/json/pdjson.h
@@ -0,0 +1,147 @@
+#ifndef PDJSON_H
+#define PDJSON_H
+
+#ifndef PDJSON_SYMEXPORT
+# define PDJSON_SYMEXPORT
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#else
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
+ #include <stdbool.h>
+#else
+ #ifndef bool
+ #define bool int
+ #define true 1
+ #define false 0
+ #endif /* bool */
+#endif /* __STDC_VERSION__ */
+#endif /* __cplusplus */
+
+#include <stdio.h>
+
+enum json_type {
+ JSON_ERROR = 1, JSON_DONE,
+ JSON_OBJECT, JSON_OBJECT_END, JSON_ARRAY, JSON_ARRAY_END,
+ JSON_STRING, JSON_NUMBER, JSON_TRUE, JSON_FALSE, JSON_NULL
+};
+
+struct json_allocator {
+ void *(*malloc)(size_t);
+ void *(*realloc)(void *, size_t);
+ void (*free)(void *);
+};
+
+typedef int (*json_user_io)(void *user);
+
+typedef struct json_stream json_stream;
+typedef struct json_allocator json_allocator;
+
+PDJSON_SYMEXPORT void json_open_buffer(json_stream *json, const void *buffer, size_t size);
+PDJSON_SYMEXPORT void json_open_string(json_stream *json, const char *string);
+PDJSON_SYMEXPORT void json_open_stream(json_stream *json, FILE *stream);
+PDJSON_SYMEXPORT void json_open_user(json_stream *json, json_user_io get, json_user_io peek, void *user);
+PDJSON_SYMEXPORT void json_close(json_stream *json);
+
+PDJSON_SYMEXPORT void json_set_allocator(json_stream *json, json_allocator *a);
+PDJSON_SYMEXPORT void json_set_streaming(json_stream *json, bool mode);
+
+PDJSON_SYMEXPORT enum json_type json_next(json_stream *json);
+PDJSON_SYMEXPORT enum json_type json_peek(json_stream *json);
+PDJSON_SYMEXPORT void json_reset(json_stream *json);
+PDJSON_SYMEXPORT const char *json_get_string(json_stream *json, size_t *length);
+PDJSON_SYMEXPORT double json_get_number(json_stream *json);
+
+PDJSON_SYMEXPORT enum json_type json_skip(json_stream *json);
+PDJSON_SYMEXPORT enum json_type json_skip_until(json_stream *json, enum json_type type);
+
+PDJSON_SYMEXPORT size_t json_get_lineno(json_stream *json);
+PDJSON_SYMEXPORT size_t json_get_position(json_stream *json);
+PDJSON_SYMEXPORT size_t json_get_column(json_stream *json);
+PDJSON_SYMEXPORT size_t json_get_depth(json_stream *json);
+PDJSON_SYMEXPORT enum json_type json_get_context(json_stream *json, size_t *count);
+PDJSON_SYMEXPORT const char *json_get_error(json_stream *json);
+
+PDJSON_SYMEXPORT int json_source_get(json_stream *json);
+PDJSON_SYMEXPORT int json_source_peek(json_stream *json);
+PDJSON_SYMEXPORT bool json_isspace(int c);
+
+/* internal */
+
+struct json_source {
+ int (*get)(struct json_source *);
+ int (*peek)(struct json_source *);
+ size_t position;
+ union {
+ struct {
+ FILE *stream;
+ } stream;
+ struct {
+ const char *buffer;
+ size_t length;
+ } buffer;
+ struct {
+ void *ptr;
+ json_user_io get;
+ json_user_io peek;
+ } user;
+ } source;
+};
+
+struct json_stream {
+ size_t lineno;
+
+ /* While counting lines is straightforward, columns are tricky because we
+ * have to count codepoints, not bytes. We could have peppered the code
+ * with increments in all the relevant places but that seems inelegant.
+ * So instead we calculate the column dynamically, based on the current
+ * position.
+ *
+ * Specifically, we will remember the position at the beginning of each
+ * line (linepos) and, assuming only the ASCII characters on the line, the
+ * column will be the difference between the current position and linepos.
+ * Of course there could also be multi-byte UTF-8 sequences which we will
+ * handle by keeping an adjustment (lineadj) -- the number of continuation
+ * bytes encountered on this line so far. Finally, for json_source_get()
+ * we also have to keep the number of remaining continuation bytes in the
+ * current multi-byte UTF-8 sequence (linecon).
+ *
+ * This is not the end of the story, however: with only the just described
+ * approach we will always end up with the column of the latest character
+ * read which is not what we want when returning potentially multi-
+ * character value events (string, number, etc); in these cases we want to
+ * return the column of the first character (note that if the value itself
+ * is invalid and we are returning JSON_ERROR, we still want the current
+ * column). So to handle this we will cache the start column (colno) for
+ * such events.
+ */
+ size_t linepos; /* Position at the beginning of the current line. */
+ size_t lineadj; /* Adjustment for multi-byte UTF-8 sequences. */
+ size_t linecon; /* Number of remaining UTF-8 continuation bytes. */
+ size_t colno; /* Start column for value events or 0. */
+
+ struct json_stack *stack;
+ size_t stack_top;
+ size_t stack_size;
+ enum json_type next;
+ unsigned flags;
+
+ struct {
+ char *string;
+ size_t string_fill;
+ size_t string_size;
+ } data;
+
+ size_t ntokens;
+
+ struct json_source source;
+ struct json_allocator alloc;
+ char errmsg[128];
+};
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif /* __cplusplus */
+
+#endif