aboutsummaryrefslogtreecommitdiff
path: root/libbutl/json/serializer.cxx
blob: 84941ed6290f99049cc21c2b11630f36523604ea (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
#include <cstdio>               // snprintf
#include <cstdarg>              // va_list
#include <cstring>              // memcpy
#include <ostream>

#include <libbutl/json/serializer.hxx>

using namespace std;

namespace butl
{
  namespace json
  {
    using buffer     = buffer_serializer::buffer;
    using error_code = invalid_json_output::error_code;

    template <typename T>
    static void
    dynarray_overflow (void* d, event, buffer& b, size_t ex)
    {
      T& v (*static_cast<T*> (d));
      v.resize (b.capacity + ex);
      v.resize (v.capacity ());
      // const_cast is required for std::string pre C++17.
      //
      b.data = const_cast<typename T::value_type*> (v.data ());
      b.capacity = v.size ();
    }

    template <typename T>
    static void
    dynarray_flush (void* d, event, buffer& b)
    {
      T& v (*static_cast<T*> (d));
      v.resize (b.size);
      b.data = const_cast<typename T::value_type*> (v.data ());
      b.capacity = b.size;
    }

    buffer_serializer::
    buffer_serializer (string& s, size_t i)
        : buffer_serializer (const_cast<char*> (s.data ()), size_, s.size (),
                             dynarray_overflow<string>,
                             dynarray_flush<string>,
                             &s,
                             i)
    {
      size_ = s.size ();
    }

    buffer_serializer::
    buffer_serializer (vector<char>& v, size_t i)
        : buffer_serializer (v.data (), size_, v.size (),
                             dynarray_overflow<vector<char>>,
                             dynarray_flush<vector<char>>,
                             &v,
                             i)
    {
      size_ = v.size ();
    }

    static void
    ostream_overflow (void* d, event e, buffer& b, size_t)
    {
      ostream& s (*static_cast<ostream*> (d));
      s.write (static_cast<char*> (b.data), b.size);
      if (s.fail ())
        throw invalid_json_output (
            e, error_code::buffer_overflow, "unable to write JSON output text");
      b.size = 0;
    }

    static void
    ostream_flush (void* d, event e, buffer& b)
    {
      ostream_overflow (d, e, b, 0);

      ostream& s (*static_cast<ostream*> (d));
      s.flush ();
      if (s.fail ())
        throw invalid_json_output (
            e, error_code::buffer_overflow, "unable to write JSON output text");
    }

    stream_serializer::
    stream_serializer (ostream& os, size_t i)
        : buffer_serializer (tmp_, sizeof (tmp_),
                             ostream_overflow,
                             ostream_flush,
                             &os,
                             i)
    {
    }

    bool buffer_serializer::
    next (optional<event> e, pair<const char*, size_t> val, bool check)
    {
      if (absent_ == 2)
        goto fail_complete;

      if (e == nullopt)
      {
        if (!state_.empty ())
          goto fail_incomplete;

        absent_++;
        return false;
      }

      absent_ = 0; // Clear inter-value absent event.

      {
        state* st (state_.empty () ? nullptr : &state_.back ());

        auto name_expected = [] (const state& s)
        {
          return s.type == event::begin_object && s.count % 2 == 0;
        };

        auto make_str = [] (const char* s, size_t n)
        {
          return make_pair (s, n);
        };

        // When it comes to pretty-printing, the common way to do it is along
        // these lines:
        //
        // {
        //   "str": "value",
        //   "obj": {
        //     "arr": [
        //       1,
        //       2,
        //       3
        //     ]
        //   },
        //   "num": 123
        // }
        //
        // Empty objects and arrays are printed without a newline:
        //
        // {
        //   "obj": {},
        //   "arr": []
        // }
        //
        // There are two types of separators: between name and value, which is
        // always ": ", and before/after value inside an object or array which
        // is either newline followed by indentation, or comma followed by
        // newline followed by indentation (we also have separation between
        // top-level values but that's orthogonal to pretty-printing).
        //
        // Based on this observation, we are going to handle the latter case by
        // starting with the ",\n" string (in this->sep_) and pushing/popping
        // indentation spaces as we enter/leave objects and arrays. We handle
        // the cases where we don't need the comma by simply skipping it in the
        // C-string pointer.
        //
        bool pp (indent_ != 0);

        pair<const char*, size_t> sep;
        if (st != nullptr)
        {
          // The name-value separator.
          //
          if (st->type == event::begin_object && st->count % 2 == 1)
          {
            sep = !pp ? make_str (":", 1) : make_str (": ", 2);
          }
          // We don't need the comma if we are closing the object or array.
          //
          else if (e == event::end_array || e == event::end_object)
          {
            // But in this case we need to unindent one level prior to writing
            // the brace. Also handle the empty object/array as a special case.
            //
            sep = !pp || st->count == 0
              ? make_str (nullptr, 0)
              : make_str (sep_.c_str () + 1, sep_.size () - 1 - indent_);
          }
          // Or if this is the first value (note: must come after end_*).
          //
          else if (st->count == 0)
          {
            sep = !pp
              ? make_str (nullptr, 0)
              : make_str (sep_.c_str () + 1, sep_.size () - 1);
          }
          else
          {
            sep = !pp
              ? make_str (",", 1)
              : make_str (sep_.c_str (), sep_.size ());
          }
        }
        else if (values_ != 0) // Subsequent top-level value.
        {
          // Top-level value separation. For now we always separate them with
          // newlines, which is the most common/sensible way.
          //
          sep = make_str ("\n", 1);
        }

        switch (*e)
        {
        case event::begin_array:
        case event::begin_object:
          {
            if (st != nullptr && name_expected (*st))
              goto fail_unexpected_event;

            write (*e,
                   sep,
                   make_str (e == event::begin_array ? "[" : "{", 1),
                   false);

            if (st != nullptr)
              st->count++;

            if (pp)
              sep_.append (indent_, ' ');

            state_.push_back (state {*e, 0});
            break;
          }
        case event::end_array:
        case event::end_object:
          {
            if (st == nullptr || (e == event::end_array
                                  ? st->type != event::begin_array
                                  : !name_expected (*st)))
              goto fail_unexpected_event;

            write (*e,
                   sep,
                   make_str (e == event::end_array ? "]" : "}", 1),
                   false);

            if (pp)
              sep_.erase (sep_.size () - indent_);

            state_.pop_back ();
            break;
          }
        case event::name:
        case event::string:
          {
            if (e == event::name
                ? (st == nullptr || !name_expected (*st))
                : (st != nullptr && name_expected (*st)))
              goto fail_unexpected_event;

            write (*e, sep, val, check, '"');

            if (st != nullptr)
              st->count++;
            break;
          }
        case event::null:
        case event::boolean:
          {
            if (e == event::null && val.first == nullptr)
              val = {"null", 4};
            else if (check)
            {
              auto eq = [&val] (const char* v, size_t n)
              {
                return val.second == n && memcmp (val.first, v, n) == 0;
              };

              if (e == event::null)
              {
                if (!eq ("null", 4))
                  goto fail_null;
              }
              else
              {
                if (!eq ("true", 4) && !eq ("false", 5))
                  goto fail_bool;
              }
            }
          }
          // Fall through.
        case event::number:
          {
            if (st != nullptr && name_expected (*st))
              goto fail_unexpected_event;

            write (*e, sep, val, check);

            if (st != nullptr)
              st->count++;
            break;
          }
        }
      }

      if (state_.empty ())
      {
        values_++;
        if (flush_ != nullptr)
          flush_ (data_, *e, buf_);

        return false;
      }

      return true;

    fail_complete:
      throw invalid_json_output (
          e, error_code::invalid_value, "value sequence is complete");
    fail_incomplete:
      throw invalid_json_output (
          e, error_code::invalid_value, "value is incomplete");
    fail_null:
      throw invalid_json_output (
          e, error_code::invalid_value, "invalid null value");
    fail_bool:
      throw invalid_json_output (
          e, error_code::invalid_value, "invalid boolean value");
    fail_unexpected_event:
      throw invalid_json_output (
          e, error_code::unexpected_event, "unexpected event");
    }

    // JSON escape sequences for control characters <= 0x1F.
    //
    static const char* json_escapes[] =
    {"\\u0000", "\\u0001", "\\u0002", "\\u0003", "\\u0004", "\\u0005",
     "\\u0006", "\\u0007", "\\b",     "\\t",     "\\n",     "\\u000B",
     "\\f",     "\\r",     "\\u000E", "\\u000F", "\\u0010", "\\u0011",
     "\\u0012", "\\u0013", "\\u0014", "\\u0015", "\\u0016", "\\u0017",
     "\\u0018", "\\u0019", "\\u001A", "\\u001B", "\\u001C", "\\u001D",
     "\\u001E", "\\u001F"};

    void buffer_serializer::
    write (event e,
           pair<const char*, size_t> sep,
           pair<const char*, size_t> val,
           bool check,
           char q)
    {
      // Assumptions:
      //
      // 1. A call to overflow should be able to provide enough capacity to
      //    write the entire separator (in other words, we are not going to
      //    bother with chunking the separator).
      //
      // 2. Similarly, a call to overflow should be able to provide enough
      //    capacity to write an entire UTF-8 multi-byte sequence.
      //
      // 3. Performance-wise, we do not expect very long contiguous sequences
      //    of character that require escaping.

      // Total number of bytes remaining to be written and the capacity
      // currently available.
      //
      size_t size (sep.second + val.second + (q != '\0' ? 2 : 0));
      size_t cap (buf_.capacity - buf_.size);

      auto grow = [this, e, &size, &cap] (size_t min, size_t extra = 0)
      {
        if (overflow_ == nullptr)
          return false;

        extra += size;
        extra -= cap;
        overflow_ (data_, e, buf_, extra > min ? extra : min);
        cap = buf_.capacity - buf_.size;

        return cap >= min;
      };

      auto append = [this, &cap, &size] (const char* d, size_t s)
      {
        memcpy (static_cast<char*> (buf_.data) + buf_.size, d, s);
        buf_.size += s;
        cap -= s;
        size -= s;
      };

      // Return the longest chunk of input that fits into the buffer and does
      // not end in the middle of a multi-byte UTF-8 sequence. Assume value
      // size and capacity are not 0. Return NULL in first if no chunk could
      // be found that fits into the remaining space. In this case, second is
      // the additional (to size) required space (used to handle escapes in
      // the checked version).
      //
      // The basic idea is to seek in the input buffer to the capacity of the
      // output buffer (unless the input is shorter than the output). If we
      // ended up in the middle of a multi-byte UTF-8 sequence, then seek back
      // until we end up at the UTF-8 sequence boundary. Note that this
      // implementation assumes valid UTF-8.
      //
      auto chunk = [&cap, &val] () -> pair<const char*, size_t>
      {
        pair<const char*, size_t> r (nullptr, 0);

        if (cap >= val.second)
          r = val;
        else
        {
          // Start from the character past capacity and search for a UTF-8
          // sequence boundary.
          //
          for (const char* p (val.first + cap); p != val.first; --p)
          {
            const auto u (static_cast<uint8_t> (*p));
            if (u < 0x80 || u > 0xBF) // Not a continuation byte
            {
              r = {val.first, p - val.first};
              break;
            }
          }
        }

        val.first += r.second;
        val.second -= r.second;

        return r;
      };

      // Escaping and UTF-8-validating version of chunk().
      //
      // There are three classes of mandatory escapes in a JSON string:
      //
      // - \\ and \"
      //
      // - \b \f \n \r \t for popular control characters
      //
      // - \u00NN for other control characters <= 0x1F
      //
      // If the input begins with a character that must be escaped, return
      // only its escape sequence. Otherwise validate and return everything up
      // to the end of input or buffer capacity, but cutting it short before
      // the next character that must be escaped or the first UTF-8 sequence
      // that would not fit.
      //
      // Return string::npos in second in case of a stray continuation byte or
      // any byte in an invalid UTF-8 range (for example, an "overlong" 2-byte
      // encoding of a 7-bit/ASCII character or a 4-, 5-, or 6-byte sequence
      // that would encode a codepoint beyond the U+10FFFF Unicode limit).
      //
      auto chunk_checked = [&cap, &size, &val] () -> pair<const char*, size_t>
      {
        pair<const char*, size_t> r (nullptr, 0);

        // Check whether the first character needs to be escaped.
        //
        const uint8_t c (val.first[0]);
        if (c == '"')
          r = {"\\\"", 2};
        else if (c == '\\')
          r = {"\\\\", 2};
        else if (c <= 0x1F)
        {
          auto s (json_escapes[c]);
          r = {s, s[1] == 'u' ? 6 : 2};
        }

        if (r.first != nullptr)
        {
          // Return in second the additional (to size) space required.
          //
          if (r.second > cap)
            return {nullptr, r.second - 1};

          // If we had to escape the character then adjust size accordingly
          // (see append() above).
          //
          size += r.second - 1;

          val.first += 1;
          val.second -= 1;
          return r;
        }

        // First character doesn't need to be escaped. Return as much of the
        // rest of the input as possible.
        //
        size_t i (0);
        for (size_t n (min (cap, val.second)); i != n; i++)
        {
          const uint8_t c1 (val.first[i]);

          if (c1 == '"' || c1 == '\\' || c1 <= 0x1F) // Needs to be escaped.
            break;
          else if (c1 >= 0x80) // Not ASCII, so validate as a UTF-8 sequence.
          {
            size_t i1 (i); // Position of the first byte.

            // The control flow here is to continue if valid and to fall
            // through to return on error.
            //
            if (c1 >= 0xC2 && c1 <= 0xDF) // 2-byte sequence.
            {
              if (i + 2 <= val.second) // Sequence is complete in JSON value.
              {
                if (i + 2 > cap) // Sequence won't fit.
                  break;

                const uint8_t c2 (val.first[++i]);

                if (c2 >= 0x80 && c2 <= 0xBF)
                  continue;
              }
            }
            else if (c1 >= 0xE0 && c1 <= 0xEF) // 3-byte sequence.
            {
              if (i + 3 <= val.second)
              {
                if (i + 3 > cap)
                  break;

                const uint8_t c2 (val.first[++i]), c3 (val.first[++i]);

                if (c3 >= 0x80 && c3 <= 0xBF)
                {
                  switch (c1)
                  {
                  case 0xE0: if (c2 >= 0xA0 && c2 <= 0xBF) continue; break;
                  case 0xED: if (c2 >= 0x80 && c2 <= 0x9F) continue; break;
                  default:   if (c2 >= 0x80 && c2 <= 0xBF) continue; break;
                  }
                }
              }
            }
            else if (c1 >= 0xF0 && c1 <= 0xF4) // 4-byte sequence.
            {
              if (i + 4 <= val.second)
              {
                if (i + 4 > cap)
                  break;

                const uint8_t c2 (val.first[++i]),
                              c3 (val.first[++i]),
                              c4 (val.first[++i]);

                if (c3 >= 0x80 && c3 <= 0xBF &&
                    c4 >= 0x80 && c4 <= 0xBF)
                {
                  switch (c1)
                  {
                  case 0xF0: if (c2 >= 0x90 && c2 <= 0xBF) continue; break;
                  case 0xF4: if (c2 >= 0x80 && c2 <= 0x8F) continue; break;
                  default:   if (c2 >= 0x80 && c2 <= 0xBF) continue; break;
                  }
                }
              }
            }

            r = {val.first, string::npos};

            // Update val to point to the beginning of the invalid sequence.
            //
            val.first += i1;
            val.second -= i1;

            return r;
          }
        }

        if (i != 0) // We have a chunk.
        {
          r = {val.first, i};

          val.first += i;
          val.second -= i;
        }

        return r;
      };

      // Value's original size (used to calculate the offset of the errant
      // character in case of a validation failure).
      //
      const size_t vn (val.second);

      // Write the separator, if any.
      //
      if (sep.second != 0)
      {
        if (cap < sep.second && !grow (sep.second))
          goto fail_nospace;

        append (sep.first, sep.second);
      }

      // Write the value's opening quote, if requested.
      //
      if (q != '\0')
      {
        if (cap == 0 && !grow (1))
          goto fail_nospace;

        append ("\"", 1);
      }

      // Write the value, unless empty.
      //
      while (val.second != 0)
      {
        pair<const char*, size_t> ch (nullptr, 0);

        if (cap != 0)
          ch = check ? chunk_checked () : chunk ();

        if (ch.first == nullptr)
        {
          // The minimum extra bytes we need the overflow function to be able
          // to provide is based on these sequences that we do not break:
          //
          // - 4 bytes for a UTF-8 sequence
          // - 6 bytes for an escaped Unicode sequence (\uXXXX).
          //
          if (!grow (6, ch.second))
            goto fail_nospace;
        }
        else if (ch.second != string::npos)
          append (ch.first, ch.second);
        else
          goto fail_utf8;
      }

      // Write the value's closing quote, if requested.
      //
      if (q != '\0')
      {
        if (cap == 0 && !grow (1))
          goto fail_nospace;

        append ("\"", 1);
      }

      return;

      // Note: keep descriptions consistent with the parser.
      //
    fail_utf8:
      throw invalid_json_output (e,
                                 e == event::name ? error_code::invalid_name
                                                  : error_code::invalid_value,
                                 "invalid UTF-8 text",
                                 vn - val.second);

    fail_nospace:
      throw invalid_json_output (
          e, error_code::buffer_overflow, "insufficient space in buffer");
    }

    size_t buffer_serializer::
    to_chars_impl (char* b, size_t n, const char* f, ...)
    {
      va_list a;
      va_start (a, f);
      const int r (vsnprintf (b, n, f, a));
      va_end (a);

      if (r < 0 || r >= static_cast<int> (n))
      {
        throw invalid_json_output (event::number,
                                   error_code::invalid_value,
                                   "unable to convert number to string");
      }

      return static_cast<size_t> (r);
    }
  }
}