From 5ae9686adac1508873f2d980e84becd3496244c2 Mon Sep 17 00:00:00 2001
From: Karen Arutyunov <karen@codesynthesis.com>
Date: Wed, 26 Feb 2020 17:16:45 +0300
Subject: Add notion of validator to char_scanner and make sure manifest is
 UTF-8

This involves implementing utf8_validator and UTF-8 utility functions and
using them during the manifest parsing, serialization, and rewriting.
---
 libbutl/utf8.cxx | 342 -------------------------------------------------------
 1 file changed, 342 deletions(-)
 delete mode 100644 libbutl/utf8.cxx

(limited to 'libbutl/utf8.cxx')
diff --git a/libbutl/utf8.cxx b/libbutl/utf8.cxx
deleted file mode 100644
index 0f24559..0000000
--- a/libbutl/utf8.cxx
+++ /dev/null
@@ -1,342 +0,0 @@
-// file      : libbutl/utf8.cxx -*- C++ -*-
-// license   : MIT; see accompanying LICENSE file
-
-#ifndef __cpp_modules_ts
-#include <libbutl/utility.mxx>
-#endif
-
-#ifndef __cpp_lib_modules_ts
-#include <string>
-#include <cstddef>
-
-#include <algorithm>    // lower_bound()
-#endif
-
-#ifdef __cpp_modules_ts
-module butl.utility;
-
-// Only imports additional to interface.
-#ifdef __clang__
-#ifdef __cpp_lib_modules_ts
-import std.core;
-import std.io;
-#endif
-#endif
-
-#endif
-
-namespace butl
-{
-  using namespace std;
-
-  // Sorted arrays of the Unicode codepoint ranges corresponding to the
-  // codepoint types. Note that the codepoint type range lists (but not the
-  // ranges themselves) may overlap.
-  //
-  // Note that the graphic type codepoints are numerous and scattered. Thus,
-  // we will consider a codepoint to be of the graphic type if it is not of
-  // any other type.
-  //
-  using codepoint_range = pair<char32_t, char32_t>;
-
-  static const codepoint_range cn_rs[] = // Control.
-  {
-    {0x00, 0x1F},
-    {0x7F, 0x9F}
-  };
-
-  static const codepoint_range fr_rs[] = // Format.
-  {
-    {0x000AD, 0x000AD},
-    {0x00600, 0x00605},
-    {0x0061C, 0x0061C},
-    {0x006DD, 0x006DD},
-    {0x0070F, 0x0070F},
-    {0x008E2, 0x008E2},
-    {0x0180E, 0x0180E},
-    {0x0200B, 0x0200F},
-    {0x0202A, 0x0202E},
-    {0x02060, 0x02064},
-    {0x02066, 0x0206F},
-    {0x0FEFF, 0x0FEFF},
-    {0x0FFF9, 0x0FFFB},
-    {0x110BD, 0x110BD},
-    {0x110CD, 0x110CD},
-    {0x13430, 0x13438},
-    {0x1BCA0, 0x1BCA3},
-    {0x1D173, 0x1D17A},
-    {0xE0001, 0xE0001},
-    {0xE0020, 0xE007F}
-  };
-
-  static const codepoint_range pr_rs[] = // Private-use.
-  {
-    {0x00E000, 0x00F8FF},
-    {0x0F0000, 0x10FFFF}
-  };
-
-  static const codepoint_range nc_rs[] = // Non-character.
-  {
-    {0xFDD0, 0xFDEF}
-  };
-
-  static const codepoint_range rs_rs[] = // Reserved.
-  {
-    {0x30000, 0xE0000},
-    {0xE0002, 0xE001F},
-    {0xE0080, 0xE00FF},
-    {0xE01F0, 0xEFFFF}
-  };
-
-  struct codepoint_type_ranges
-  {
-    codepoint_types type;
-    const codepoint_range* begin;
-    const codepoint_range* end;
-  };
-
-  static const codepoint_type_ranges ct_ranges[] =
-  {
-    {
-      codepoint_types::control,
-      cn_rs,
-      cn_rs + sizeof (cn_rs) / sizeof (*cn_rs)
-    },
-    {
-      codepoint_types::format,
-      fr_rs,
-      fr_rs + sizeof (fr_rs) / sizeof (*fr_rs)
-    },
-    {
-      codepoint_types::private_use,
-      pr_rs,
-      pr_rs + sizeof (pr_rs) / sizeof (*pr_rs)
-    },
-    {
-      codepoint_types::non_character,
-      nc_rs,
-      nc_rs + sizeof (nc_rs) / sizeof (*nc_rs)
-    },
-    {
-      codepoint_types::reserved,
-      rs_rs,
-      rs_rs + sizeof (rs_rs) / sizeof (*rs_rs)
-    }
-  };
-
-  bool
-  utf8 (const string& s, codepoint_types ts, const char32_t* wl)
-  {
-    // A UCS-4 character is encoded as the UTF-8 byte sequence as follows,
-    // depending on the value range it falls into:
-    //
-    // 0x00000000 - 0x0000007F:
-    //   0xxxxxxx
-    //
-    // 0x00000080 - 0x000007FF:
-    //   110xxxxx 10xxxxxx
-    //
-    // 0x00000800 - 0x0000FFFF:
-    //   1110xxxx 10xxxxxx 10xxxxxx
-    //
-    // 0x00010000 - 0x001FFFFF:
-    //   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
-    //
-    // 0x00200000 - 0x03FFFFFF:
-    //   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
-    //
-    // 0x04000000 - 0x7FFFFFFF:
-    //   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
-    //
-    // Also note that the Unicode Standard (as of 12.1) specifies no
-    // codepoints above 0x10FFFF, so we consider 5- and 6-byte length UTF-8
-    // sequences as invalid (we could have added `unspecified` codepoint type
-    // except that there are no UTF-8 validation tables defined for these
-    // sequences).
-    //
-    size_t n (s.size ());
-
-    for (size_t i (0); i != n; )
-    {
-      // Detect the UTF-8 byte sequence length based on its first byte. While
-      // at it, start calculating the Unicode codepoint value.
-      //
-      size_t sn;
-      char32_t c;
-      unsigned char b1 (s[i]);
-
-      if (b1 < 0x80)
-      {
-        sn = 1;
-        c  = b1;
-      }
-      else if (b1 < 0xE0)
-      {
-        sn = 2;
-        c  = b1 & 0x1F; // Takes 5 rightmost bits of the first sequence byte.
-      }
-      else if (b1 < 0xF0)
-      {
-        sn = 3;
-        c  = b1 & 0xF; // Takes 4 rightmost bits of the first sequence byte.
-      }
-      else if (b1 < 0xF8)
-      {
-        sn = 4;
-        c  = b1 & 0x7; // Takes 3 rightmost bits of the first sequence byte.
-      }
-      else
-        return false; // The byte starts a 5- or 6-byte long sequence.
-
-      // Bail out if the string doesn't contain all the required codepoint
-      // encoding bytes.
-      //
-      if (sn > n - i)
-        return false;
-
-      // Note that while a codepoint may potentially be encoded with byte
-      // sequences of different lengths, only the shortest encoding sequence
-      // is considered well-formed. Also a well-formed sequence may not be
-      // decoded into a UTF-16 surrogate value ([D800 DFFF]) or a value that
-      // is greater than the max codepoint value (0x10FFFF). We will check all
-      // that using the Well-Formed UTF-8 Byte Sequences table (provided by
-      // the Unicode 12.0 Standard) which also takes care of the missing UTF-8
-      // sequence bytes.
-      //
-      // Return true if a byte value belongs to the specified range.
-      //
-      auto belongs = [] (unsigned char c, unsigned char l, unsigned char r)
-      {
-        return c >= l && c <= r;
-      };
-
-      switch (sn)
-      {
-      case 1: break; // Always well-formed by the definition (see above).
-      case 2:
-        {
-          // [000080 0007FF]: [C2 DF]  [80 BF]
-          //
-          // Check the first/second byte combinations:
-          //
-          if (!(belongs (b1, 0xC2, 0xDF) && belongs (s[i + 1], 0x80, 0xBF)))
-            return false;
-
-          break;
-        }
-      case 3:
-        {
-          // [000800 000FFF]: E0       [A0 BF]  [80 BF]
-          // [001000 00CFFF]: [E1 EC]  [80 BF]  [80 BF]
-          // [00D000 00D7FF]: ED       [80 9F]  [80 BF] ; Excludes surrogates.
-          // [00E000 00FFFF]: [EE EF]  [80 BF]  [80 BF]
-          //
-          unsigned char b2 (s[i + 1]);
-
-          if (!((b1 == 0xE0               && belongs (b2, 0xA0, 0xBF)) ||
-                (belongs (b1, 0xE1, 0xEC) && belongs (b2, 0x80, 0xBF)) ||
-                (b1 == 0xED               && belongs (b2, 0x80, 0x9F)) ||
-                (belongs (b1, 0xEE, 0xEF) && belongs (b2, 0x80, 0xBF))) ||
-              !belongs (s[i + 2], 0x80, 0xBF))
-            return false;
-
-          break;
-        }
-      case 4:
-        {
-          // [010000 03FFFF]: F0       [90 BF]  [80 BF]  [80 BF]
-          // [040000 0FFFFF]: [F1 F3]  [80 BF]  [80 BF]  [80 BF]
-          // [100000 10FFFF]: F4       [80 8F]  [80 BF]  [80 BF]
-          //
-          unsigned char b2 (s[i + 1]);
-
-          if (!((b1 == 0xF0               && belongs (b2, 0x90, 0xBF)) ||
-                (belongs (b1, 0xF1, 0xF3) && belongs (b2, 0x80, 0xBF)) ||
-                (b1 == 0xF4               && belongs (b2, 0x80, 0x8F))) ||
-              !belongs (s[i + 2], 0x80, 0xBF)                           ||
-              !belongs (s[i + 3], 0x80, 0xBF))
-            return false;
-
-          break;
-        }
-      }
-
-      // For the remaining sequence bytes, "append" their 6 rightmost bits to
-      // the resulting codepoint value.
-      //
-      --sn;
-      ++i;
-
-      for (size_t n (i + sn); i != n; ++i)
-        c = (c << 6) | (s[i] & 0x3F);
-
-      // Check the decoded codepoint, unless any codepoint type is allowed.
-      //
-      if (ts == codepoint_types::any)
-        continue;
-
-      using traits = u32string::traits_type;
-
-      // Check if the decoded codepoint is whitelisted.
-      //
-      if (wl != nullptr &&
-          traits::find (wl, traits::length (wl), c) != nullptr)
-        continue;
-
-      // Match the decoded codepoint type against the specified type set.
-      //
-      // Detect the codepoint type (see the Types of Code Points table in the
-      // Unicode 12.0 Standard for details).
-      //
-      codepoint_types ct;
-
-      // Optimize for the common case (printable ASCII characters).
-      //
-      if (c >= 0x20 && c <= 0x7E)
-        ct = codepoint_types::graphic;
-      else if ((c & 0xFFFF) >= 0xFFFE) // Non-range based detection.
-        ct = codepoint_types::non_character;
-      else
-      {
-        // Note that we consider a codepoint to be of the graphic type if it
-        // is not of any other type (see above).
-        //
-        ct = codepoint_types::graphic;
-
-        // Note that the codepoint type range lists may overlap. Thus, we
-        // iterate over all of them until there is a match.
-        //
-        for (size_t i (0); i != sizeof (ct_ranges) / sizeof (*ct_ranges); ++i)
-        {
-          const codepoint_type_ranges& rs (ct_ranges[i]);
-
-          // Find the range that either contains the codepoint or lies to the
-          // right of it. Note that here we assume a range to be less than a
-          // codepoint if it lies to the left of the codepoint.
-          //
-          const codepoint_range* r (
-            lower_bound (rs.begin, rs.end,
-                         c,
-                         [] (const codepoint_range& r, char32_t c)
-                         {
-                           return r.second < c;
-                         }));
-
-          if (r != rs.end && r->first <= c) // Contains the codepoint?
-          {
-            ct = rs.type;
-            break;
-          }
-        }
-      }
-
-      // Now check if the codepoint type matches the specified set. Note: also
-      // covers the `ts == codepoint_types::none` case.
-      //
-      if ((ct & ts) == codepoint_types::none)
-        return false;
-    }
-
-    return true;
-  }
-}
-- 
cgit v1.1